From 713fd67f97dee40399d64fdc3ce5069281185bd3 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 9 Sep 2002 13:31:53 +0100
Subject: Add three functions for rbtree manipulation -- rb_next(), rb_prev()
 and rb_replace_node()

rb_next() and rb_prev() return the next and previous nodes in the tree, respectively.

rb_replace_node() allows fast replacement of a single node without having to remove the
victim, rebalance the tree, insert the replacement and then rebalance again to the original
topology.
---
 include/linux/rbtree.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 96f20e145be5..27ca30a8372c 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -121,6 +121,13 @@ rb_root_t;
 extern void rb_insert_color(rb_node_t *, rb_root_t *);
 extern void rb_erase(rb_node_t *, rb_root_t *);
 
+/* Find logical next and previous nodes in a tree */
+extern rb_node_t *rb_next(rb_node_t *);
+extern rb_node_t *rb_prev(rb_node_t *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(rb_node_t *victim, rb_node_t *new, rb_root_t *root);
+
 static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
 {
 	node->rb_parent = parent;
-- 
cgit v1.2.3


From 972d3674d680ddd89e10501cf5dcbaa2ee67e7d7 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Mon, 9 Sep 2002 14:39:12 +0100
Subject: Remove bogus rb_root_t and rb_node_t typedefs in favour of 'struct
 rb_node' and 'struct rb_root' Remove duplicate implementation of rb_next() in
 net/sched/sch_htb.c while we're at it.

---
 include/linux/mm.h     |  2 +-
 include/linux/rbtree.h | 44 ++++++++++++++++++++++----------------------
 include/linux/sched.h  |  2 +-
 lib/rbtree.c           | 45 +++++++++++++++++++++++----------------------
 mm/mmap.c              | 34 ++++++++++++++++++----------------
 net/sched/sch_htb.c    | 47 ++++++++++++++++++-----------------------------
 6 files changed, 83 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5e01743a0bc6..64b340700eaf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -54,7 +54,7 @@ struct vm_area_struct {
 	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
 	unsigned long vm_flags;		/* Flags, listed below. */
 
-	rb_node_t vm_rb;
+	struct rb_node vm_rb;
 
 	/*
 	 * For areas with an address space and backing store,
diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index 27ca30a8372c..7bfb4bf7ad79 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -34,7 +34,7 @@
 static inline struct page * rb_search_page_cache(struct inode * inode,
 						 unsigned long offset)
 {
-	rb_node_t * n = inode->i_rb_page_cache.rb_node;
+	struct rb_node * n = inode->i_rb_page_cache.rb_node;
 	struct page * page;
 
 	while (n)
@@ -53,10 +53,10 @@ static inline struct page * rb_search_page_cache(struct inode * inode,
 
 static inline struct page * __rb_insert_page_cache(struct inode * inode,
 						   unsigned long offset,
-						   rb_node_t * node)
+						   struct rb_node * node)
 {
-	rb_node_t ** p = &inode->i_rb_page_cache.rb_node;
-	rb_node_t * parent = NULL;
+	struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
+	struct rb_node * parent = NULL;
 	struct page * page;
 
 	while (*p)
@@ -79,7 +79,7 @@ static inline struct page * __rb_insert_page_cache(struct inode * inode,
 
 static inline struct page * rb_insert_page_cache(struct inode * inode,
 						 unsigned long offset,
-						 rb_node_t * node)
+						 struct rb_node * node)
 {
 	struct page * ret;
 	if ((ret = __rb_insert_page_cache(inode, offset, node)))
@@ -97,38 +97,38 @@ static inline struct page * rb_insert_page_cache(struct inode * inode,
 #include <linux/kernel.h>
 #include <linux/stddef.h>
 
-typedef struct rb_node_s
+struct rb_node
 {
-	struct rb_node_s * rb_parent;
+	struct rb_node *rb_parent;
 	int rb_color;
 #define	RB_RED		0
 #define	RB_BLACK	1
-	struct rb_node_s * rb_right;
-	struct rb_node_s * rb_left;
-}
-rb_node_t;
+	struct rb_node *rb_right;
+	struct rb_node *rb_left;
+};
 
-typedef struct rb_root_s
+struct rb_root
 {
-	struct rb_node_s * rb_node;
-}
-rb_root_t;
+	struct rb_node *rb_node;
+};
 
-#define RB_ROOT	(rb_root_t) { NULL, }
+#define RB_ROOT	(struct rb_root) { NULL, }
 #define	rb_entry(ptr, type, member)					\
 	((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
 
-extern void rb_insert_color(rb_node_t *, rb_root_t *);
-extern void rb_erase(rb_node_t *, rb_root_t *);
+extern void rb_insert_color(struct rb_node *, struct rb_root *);
+extern void rb_erase(struct rb_node *, struct rb_root *);
 
 /* Find logical next and previous nodes in a tree */
-extern rb_node_t *rb_next(rb_node_t *);
-extern rb_node_t *rb_prev(rb_node_t *);
+extern struct rb_node *rb_next(struct rb_node *);
+extern struct rb_node *rb_prev(struct rb_node *);
 
 /* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(rb_node_t *victim, rb_node_t *new, rb_root_t *root);
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+			    struct rb_root *root);
 
-static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
+static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
+				struct rb_node ** rb_link)
 {
 	node->rb_parent = parent;
 	node->rb_color = RB_RED;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 896b7f59941c..55067b1b8062 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -173,7 +173,7 @@ struct namespace;
 struct kioctx;
 struct mm_struct {
 	struct vm_area_struct * mmap;		/* list of VMAs */
-	rb_root_t mm_rb;
+	struct rb_root mm_rb;
 	struct vm_area_struct * mmap_cache;	/* last find_vma result */
 	pgd_t * pgd;
 	atomic_t mm_users;			/* How many users with user space? */
diff --git a/lib/rbtree.c b/lib/rbtree.c
index 97e8bb448afb..91590e07ebbe 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -23,9 +23,9 @@
 #include <linux/rbtree.h>
 #include <linux/module.h>
 
-static void __rb_rotate_left(rb_node_t * node, rb_root_t * root)
+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
 {
-	rb_node_t * right = node->rb_right;
+	struct rb_node *right = node->rb_right;
 
 	if ((node->rb_right = right->rb_left))
 		right->rb_left->rb_parent = node;
@@ -43,9 +43,9 @@ static void __rb_rotate_left(rb_node_t * node, rb_root_t * root)
 	node->rb_parent = right;
 }
 
-static void __rb_rotate_right(rb_node_t * node, rb_root_t * root)
+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
 {
-	rb_node_t * left = node->rb_left;
+	struct rb_node *left = node->rb_left;
 
 	if ((node->rb_left = left->rb_right))
 		left->rb_right->rb_parent = node;
@@ -63,9 +63,9 @@ static void __rb_rotate_right(rb_node_t * node, rb_root_t * root)
 	node->rb_parent = left;
 }
 
-void rb_insert_color(rb_node_t * node, rb_root_t * root)
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
 {
-	rb_node_t * parent, * gparent;
+	struct rb_node *parent, *gparent;
 
 	while ((parent = node->rb_parent) && parent->rb_color == RB_RED)
 	{
@@ -74,7 +74,7 @@ void rb_insert_color(rb_node_t * node, rb_root_t * root)
 		if (parent == gparent->rb_left)
 		{
 			{
-				register rb_node_t * uncle = gparent->rb_right;
+				register struct rb_node *uncle = gparent->rb_right;
 				if (uncle && uncle->rb_color == RB_RED)
 				{
 					uncle->rb_color = RB_BLACK;
@@ -87,7 +87,7 @@ void rb_insert_color(rb_node_t * node, rb_root_t * root)
 
 			if (parent->rb_right == node)
 			{
-				register rb_node_t * tmp;
+				register struct rb_node *tmp;
 				__rb_rotate_left(parent, root);
 				tmp = parent;
 				parent = node;
@@ -99,7 +99,7 @@ void rb_insert_color(rb_node_t * node, rb_root_t * root)
 			__rb_rotate_right(gparent, root);
 		} else {
 			{
-				register rb_node_t * uncle = gparent->rb_left;
+				register struct rb_node *uncle = gparent->rb_left;
 				if (uncle && uncle->rb_color == RB_RED)
 				{
 					uncle->rb_color = RB_BLACK;
@@ -112,7 +112,7 @@ void rb_insert_color(rb_node_t * node, rb_root_t * root)
 
 			if (parent->rb_left == node)
 			{
-				register rb_node_t * tmp;
+				register struct rb_node *tmp;
 				__rb_rotate_right(parent, root);
 				tmp = parent;
 				parent = node;
@@ -129,10 +129,10 @@ void rb_insert_color(rb_node_t * node, rb_root_t * root)
 }
 EXPORT_SYMBOL(rb_insert_color);
 
-static void __rb_erase_color(rb_node_t * node, rb_node_t * parent,
-			     rb_root_t * root)
+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
+			     struct rb_root *root)
 {
-	rb_node_t * other;
+	struct rb_node *other;
 
 	while ((!node || node->rb_color == RB_BLACK) && node != root->rb_node)
 	{
@@ -160,7 +160,7 @@ static void __rb_erase_color(rb_node_t * node, rb_node_t * parent,
 				if (!other->rb_right ||
 				    other->rb_right->rb_color == RB_BLACK)
 				{
-					register rb_node_t * o_left;
+					register struct rb_node *o_left;
 					if ((o_left = other->rb_left))
 						o_left->rb_color = RB_BLACK;
 					other->rb_color = RB_RED;
@@ -200,7 +200,7 @@ static void __rb_erase_color(rb_node_t * node, rb_node_t * parent,
 				if (!other->rb_left ||
 				    other->rb_left->rb_color == RB_BLACK)
 				{
-					register rb_node_t * o_right;
+					register struct rb_node *o_right;
 					if ((o_right = other->rb_right))
 						o_right->rb_color = RB_BLACK;
 					other->rb_color = RB_RED;
@@ -221,9 +221,9 @@ static void __rb_erase_color(rb_node_t * node, rb_node_t * parent,
 		node->rb_color = RB_BLACK;
 }
 
-void rb_erase(rb_node_t * node, rb_root_t * root)
+void rb_erase(struct rb_node *node, struct rb_root *root)
 {
-	rb_node_t * child, * parent;
+	struct rb_node *child, *parent;
 	int color;
 
 	if (!node->rb_left)
@@ -232,7 +232,7 @@ void rb_erase(rb_node_t * node, rb_root_t * root)
 		child = node->rb_left;
 	else
 	{
-		rb_node_t * old = node, * left;
+		struct rb_node *old = node, *left;
 
 		node = node->rb_right;
 		while ((left = node->rb_left))
@@ -296,7 +296,7 @@ void rb_erase(rb_node_t * node, rb_root_t * root)
 }
 EXPORT_SYMBOL(rb_erase);
 
-rb_node_t *rb_next(rb_node_t *node)
+struct rb_node *rb_next(struct rb_node *node)
 {
 	/* If we have a right-hand child, go down and then left as far
 	   as we can. */
@@ -320,7 +320,7 @@ rb_node_t *rb_next(rb_node_t *node)
 }
 EXPORT_SYMBOL(rb_next);
 
-rb_node_t *rb_prev(rb_node_t *node)
+struct rb_node *rb_prev(struct rb_node *node)
 {
 	/* If we have a left-hand child, go down and then right as far
 	   as we can. */
@@ -340,9 +340,10 @@ rb_node_t *rb_prev(rb_node_t *node)
 }
 EXPORT_SYMBOL(rb_prev);
 
-void rb_replace_node(rb_node_t *victim, rb_node_t *new, rb_root_t *root)
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+		     struct rb_root *root)
 {
-	rb_node_t *parent = victim->rb_parent;
+	struct rb_node *parent = victim->rb_parent;
 
 	/* Set the surrounding nodes to point to the replacement */
 	if (parent) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 4e39ff6d91ff..c24fae0fd118 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -245,7 +245,7 @@ static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flag
 }
 
 #ifdef DEBUG_MM_RB
-static int browse_rb(rb_node_t * rb_node) {
+static int browse_rb(struct rb_node * rb_node) {
 	int i = 0;
 	if (rb_node) {
 		i++;
@@ -277,10 +277,11 @@ static void validate_mm(struct mm_struct * mm) {
 
 static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr,
 						struct vm_area_struct ** pprev,
-						rb_node_t *** rb_link, rb_node_t ** rb_parent)
+						struct rb_node *** rb_link,
+						struct rb_node ** rb_parent)
 {
 	struct vm_area_struct * vma;
-	rb_node_t ** __rb_link, * __rb_parent, * rb_prev;
+	struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
 
 	__rb_link = &mm->mm_rb.rb_node;
 	rb_prev = __rb_parent = NULL;
@@ -311,8 +312,8 @@ static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned
 	return vma;
 }
 
-static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
-				   rb_node_t * rb_parent)
+static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma,
+				   struct vm_area_struct * prev, struct rb_node * rb_parent)
 {
 	if (prev) {
 		vma->vm_next = prev->vm_next;
@@ -327,7 +328,7 @@ static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct
 }
 
 static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma,
-				 rb_node_t ** rb_link, rb_node_t * rb_parent)
+				 struct rb_node ** rb_link, struct rb_node * rb_parent)
 {
 	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
@@ -353,7 +354,7 @@ static inline void __vma_link_file(struct vm_area_struct * vma)
 }
 
 static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma,  struct vm_area_struct * prev,
-		       rb_node_t ** rb_link, rb_node_t * rb_parent)
+		       struct rb_node ** rb_link, struct rb_node * rb_parent)
 {
 	__vma_link_list(mm, vma, prev, rb_parent);
 	__vma_link_rb(mm, vma, rb_link, rb_parent);
@@ -361,7 +362,7 @@ static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma,  stru
 }
 
 static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
-			    rb_node_t ** rb_link, rb_node_t * rb_parent)
+			    struct rb_node ** rb_link, struct rb_node * rb_parent)
 {
 	spin_lock(&mm->page_table_lock);
 	lock_vma_mappings(vma);
@@ -374,7 +375,8 @@ static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma,
 }
 
 static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev,
-		     rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags)
+		     struct rb_node * rb_parent, unsigned long addr, 
+		     unsigned long end, unsigned long vm_flags)
 {
 	spinlock_t * lock = &mm->page_table_lock;
 	if (!prev) {
@@ -426,7 +428,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 	unsigned int vm_flags;
 	int correct_wcount = 0;
 	int error;
-	rb_node_t ** rb_link, * rb_parent;
+	struct rb_node ** rb_link, * rb_parent;
 	unsigned long charged = 0;
 
 	if (file && (!file->f_op || !file->f_op->mmap))
@@ -698,7 +700,7 @@ struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
 		/* (Cache hit rate is typically around 35%.) */
 		vma = mm->mmap_cache;
 		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
-			rb_node_t * rb_node;
+			struct rb_node * rb_node;
 
 			rb_node = mm->mm_rb.rb_node;
 			vma = NULL;
@@ -728,7 +730,7 @@ struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
 				      struct vm_area_struct **pprev)
 {
 	struct vm_area_struct *vma = NULL, *prev = NULL;
-	rb_node_t * rb_node;
+	struct rb_node * rb_node;
 	if (!mm)
 		goto out;
 
@@ -1158,7 +1160,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	struct mm_struct * mm = current->mm;
 	struct vm_area_struct * vma, * prev;
 	unsigned long flags;
-	rb_node_t ** rb_link, * rb_parent;
+	struct rb_node ** rb_link, * rb_parent;
 
 	len = PAGE_ALIGN(len);
 	if (!len)
@@ -1236,7 +1238,7 @@ out:
 void build_mmap_rb(struct mm_struct * mm)
 {
 	struct vm_area_struct * vma;
-	rb_node_t ** rb_link, * rb_parent;
+	struct rb_node ** rb_link, * rb_parent;
 
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
@@ -1319,7 +1321,7 @@ void exit_mmap(struct mm_struct * mm)
 void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
 	struct vm_area_struct * __vma, * prev;
-	rb_node_t ** rb_link, * rb_parent;
+	struct rb_node ** rb_link, * rb_parent;
 
 	__vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
 	if (__vma && __vma->vm_start < vma->vm_end)
@@ -1332,7 +1334,7 @@ void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 {
 	struct vm_area_struct * __vma, * prev;
-	rb_node_t ** rb_link, * rb_parent;
+	struct rb_node ** rb_link, * rb_parent;
 
 	__vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
 	if (__vma && __vma->vm_start < vma->vm_end)
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index ecc1c7c03622..d271d4defc14 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -162,12 +162,12 @@ struct htb_class
 		    struct list_head drop_list;
 	    } leaf;
 	    struct htb_class_inner {
-		    rb_root_t feed[TC_HTB_NUMPRIO];	/* feed trees */
-		    rb_node_t *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
+		    struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
+		    struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
 	    } inner;
     } un;
-    rb_node_t node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
-    rb_node_t pq_node;			/* node for event queue */
+    struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */
+    struct rb_node pq_node;		 /* node for event queue */
     unsigned long pq_key;	/* the same type as jiffies global */
     
     int prio_activity;		/* for which prios are we active */
@@ -207,12 +207,12 @@ struct htb_sched
     struct list_head drops[TC_HTB_NUMPRIO];	/* active leaves (for drops) */
     
     /* self list - roots of self generating tree */
-    rb_root_t row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
+    struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
     int row_mask[TC_HTB_MAXDEPTH];
-    rb_node_t *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
+    struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
 
     /* self wait list - roots of wait PQs per row */
-    rb_root_t wait_pq[TC_HTB_MAXDEPTH];
+    struct rb_root wait_pq[TC_HTB_MAXDEPTH];
 
     /* time of nearest event per level (row) */
     unsigned long near_ev_cache[TC_HTB_MAXDEPTH];
@@ -324,9 +324,9 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch)
 }
 
 #ifdef HTB_DEBUG
-static void htb_next_rb_node(rb_node_t **n);
+static void htb_next_rb_node(struct rb_node **n);
 #define HTB_DUMTREE(root,memb) if(root) { \
-	rb_node_t *n = (root)->rb_node; \
+	struct rb_node *n = (root)->rb_node; \
 	while (n->rb_left) n = n->rb_left; \
 	while (n) { \
 		struct htb_class *cl = rb_entry(n, struct htb_class, memb); \
@@ -375,10 +375,10 @@ static void htb_debug_dump (struct htb_sched *q)
  * Routine adds class to the list (actually tree) sorted by classid.
  * Make sure that class is not already on such list for given prio.
  */
-static void htb_add_to_id_tree (HTB_ARGQ rb_root_t *root,
+static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root,
 		struct htb_class *cl,int prio)
 {
-	rb_node_t **p = &root->rb_node, *parent = NULL;
+	struct rb_node **p = &root->rb_node, *parent = NULL;
 	HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio);
 #ifdef HTB_DEBUG
 	if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; }
@@ -411,7 +411,7 @@ static void htb_add_to_id_tree (HTB_ARGQ rb_root_t *root,
 static void htb_add_to_wait_tree (struct htb_sched *q,
 		struct htb_class *cl,long delay,int debug_hint)
 {
-	rb_node_t **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
+	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
 	HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key);
 #ifdef HTB_DEBUG
 	if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; }
@@ -447,20 +447,9 @@ static void htb_add_to_wait_tree (struct htb_sched *q,
  * When we are past last key we return NULL.
  * Average complexity is 2 steps per call.
  */
-static void htb_next_rb_node(rb_node_t **n)
+static void htb_next_rb_node(struct rb_node **n)
 {
-	rb_node_t *p;
-	if ((*n)->rb_right) {
-		*n = (*n)->rb_right;
-		while ((*n)->rb_left) 
-			*n = (*n)->rb_left;
-		return;
-	}
-	while ((p = (*n)->rb_parent) != NULL) {
-		if (p->rb_left == *n) break;
-		*n = p;
-	}
-	*n = p;
+	*n = rb_next(*n);
 }
 
 /**
@@ -869,7 +858,7 @@ static long htb_do_events(struct htb_sched *q,int level)
 	for (i = 0; i < 500; i++) {
 		struct htb_class *cl;
 		long diff;
-		rb_node_t *p = q->wait_pq[level].rb_node;
+		struct rb_node *p = q->wait_pq[level].rb_node;
 		if (!p) return 0;
 		while (p->rb_left) p = p->rb_left;
 
@@ -906,12 +895,12 @@ static long htb_do_events(struct htb_sched *q,int level)
  * Find leaf where current feed pointers points to.
  */
 static struct htb_class *
-htb_lookup_leaf(rb_root_t *tree,int prio,rb_node_t **pptr)
+htb_lookup_leaf(struct rb_root *tree,int prio,struct rb_node **pptr)
 {
 	int i;
 	struct {
-		rb_node_t *root;
-		rb_node_t **pptr;
+		struct rb_node *root;
+		struct rb_node **pptr;
 	} stk[TC_HTB_MAXDEPTH],*sp = stk;
 	
 	sp->root = tree->rb_node;
-- 
cgit v1.2.3


From ef5cc2fd95520561f7e3c8c49b809000dee033ba Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@dhcp212.munich.sgi.com>
Date: Wed, 11 Sep 2002 06:15:32 -0400
Subject: Import XFS CVS from 10092002

---
 Documentation/Changes              |   16 +
 Documentation/filesystems/00-INDEX |    2 +
 Documentation/filesystems/xfs.txt  |  118 +
 MAINTAINERS                        |    8 +
 fs/Config.help                     |   47 +
 fs/Config.in                       |    7 +
 fs/Makefile                        |    1 +
 fs/xfs/Makefile                    |  160 +
 fs/xfs/linux/xfs_aops.c            |  949 ++++++
 fs/xfs/linux/xfs_behavior.c        |  199 ++
 fs/xfs/linux/xfs_behavior.h        |  211 ++
 fs/xfs/linux/xfs_cred.h            |   50 +
 fs/xfs/linux/xfs_file.c            |  334 ++
 fs/xfs/linux/xfs_fs_subr.c         |  169 +
 fs/xfs/linux/xfs_fs_subr.h         |   50 +
 fs/xfs/linux/xfs_globals.c         |   71 +
 fs/xfs/linux/xfs_globals.h         |   50 +
 fs/xfs/linux/xfs_ioctl.c           | 1073 ++++++
 fs/xfs/linux/xfs_iops.c            |  875 +++++
 fs/xfs/linux/xfs_iops.h            |   71 +
 fs/xfs/linux/xfs_linux.h           |  290 ++
 fs/xfs/linux/xfs_lrw.c             | 1813 +++++++++++
 fs/xfs/linux/xfs_lrw.h             |   75 +
 fs/xfs/linux/xfs_stats.c           |  132 +
 fs/xfs/linux/xfs_stats.h           |   46 +
 fs/xfs/linux/xfs_super.c           |  888 +++++
 fs/xfs/linux/xfs_super.h           |   97 +
 fs/xfs/linux/xfs_sysctl.c          |  135 +
 fs/xfs/linux/xfs_sysctl.h          |   66 +
 fs/xfs/linux/xfs_version.h         |   44 +
 fs/xfs/linux/xfs_vfs.h             |  208 ++
 fs/xfs/linux/xfs_vnode.c           |  468 +++
 fs/xfs/linux/xfs_vnode.h           |  765 +++++
 fs/xfs/pagebuf/page_buf.c          | 2154 ++++++++++++
 fs/xfs/pagebuf/page_buf.h          |  408 +++
 fs/xfs/pagebuf/page_buf_internal.h |  170 +
 fs/xfs/pagebuf/page_buf_locking.c  |  228 ++
 fs/xfs/pagebuf/page_buf_trace.h    |   95 +
 fs/xfs/support/atomic.h            |   63 +
 fs/xfs/support/debug.c             |  126 +
 fs/xfs/support/debug.h             |   67 +
 fs/xfs/support/kmem.c              |  257 ++
 fs/xfs/support/kmem.h              |   62 +
 fs/xfs/support/ktrace.c            |  374 +++
 fs/xfs/support/ktrace.h            |  105 +
 fs/xfs/support/move.c              |   92 +
 fs/xfs/support/move.h              |   71 +
 fs/xfs/support/mrlock.c            |  288 ++
 fs/xfs/support/mrlock.h            |   86 +
 fs/xfs/support/mutex.h             |   55 +
 fs/xfs/support/qsort.c             |  243 ++
 fs/xfs/support/qsort.h             |   41 +
 fs/xfs/support/sema.h              |   68 +
 fs/xfs/support/spin.h              |   74 +
 fs/xfs/support/sv.h                |   91 +
 fs/xfs/support/time.h              |   49 +
 fs/xfs/support/uuid.c              |  238 ++
 fs/xfs/support/uuid.h              |   41 +
 fs/xfs/xfs.h                       |  114 +
 fs/xfs/xfs_acl.c                   |  893 +++++
 fs/xfs/xfs_acl.h                   |  120 +
 fs/xfs/xfs_ag.h                    |  377 +++
 fs/xfs/xfs_alloc.c                 | 2639 +++++++++++++++
 fs/xfs/xfs_alloc.h                 |  223 ++
 fs/xfs/xfs_alloc_btree.c           | 2155 ++++++++++++
 fs/xfs/xfs_alloc_btree.h           |  251 ++
 fs/xfs/xfs_arch.h                  |  274 ++
 fs/xfs/xfs_attr.c                  | 2294 +++++++++++++
 fs/xfs/xfs_attr.h                  |  160 +
 fs/xfs/xfs_attr_fetch.c            |   71 +
 fs/xfs/xfs_attr_leaf.c             | 2971 +++++++++++++++++
 fs/xfs/xfs_attr_leaf.h             |  305 ++
 fs/xfs/xfs_attr_sf.h               |  157 +
 fs/xfs/xfs_bit.c                   |  307 ++
 fs/xfs/xfs_bit.h                   |   85 +
 fs/xfs/xfs_bmap.c                  | 6323 ++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap.h                  |  397 +++
 fs/xfs/xfs_bmap_btree.c            | 2634 +++++++++++++++
 fs/xfs/xfs_bmap_btree.h            |  656 ++++
 fs/xfs/xfs_btree.c                 |  937 ++++++
 fs/xfs/xfs_btree.h                 |  587 ++++
 fs/xfs/xfs_buf.h                   |  319 ++
 fs/xfs/xfs_buf_item.c              | 1203 +++++++
 fs/xfs/xfs_buf_item.h              |  178 +
 fs/xfs/xfs_cap.c                   |  202 ++
 fs/xfs/xfs_cap.h                   |   84 +
 fs/xfs/xfs_clnt.h                  |  131 +
 fs/xfs/xfs_da_btree.c              | 2579 +++++++++++++++
 fs/xfs/xfs_da_btree.h              |  343 ++
 fs/xfs/xfs_dfrag.c                 |  364 +++
 fs/xfs/xfs_dfrag.h                 |   67 +
 fs/xfs/xfs_dinode.h                |  480 +++
 fs/xfs/xfs_dir.c                   | 1183 +++++++
 fs/xfs/xfs_dir.h                   |  162 +
 fs/xfs/xfs_dir2.c                  |  830 +++++
 fs/xfs/xfs_dir2.h                  |  110 +
 fs/xfs/xfs_dir2_block.c            | 1231 +++++++
 fs/xfs/xfs_dir2_block.h            |  128 +
 fs/xfs/xfs_dir2_data.c             |  833 +++++
 fs/xfs/xfs_dir2_data.h             |  232 ++
 fs/xfs/xfs_dir2_leaf.c             | 1873 +++++++++++
 fs/xfs/xfs_dir2_leaf.h             |  361 ++
 fs/xfs/xfs_dir2_node.c             | 1976 +++++++++++
 fs/xfs/xfs_dir2_node.h             |  160 +
 fs/xfs/xfs_dir2_sf.c               | 1320 ++++++++
 fs/xfs/xfs_dir2_sf.h               |  256 ++
 fs/xfs/xfs_dir2_trace.c            |  202 ++
 fs/xfs/xfs_dir2_trace.h            |   95 +
 fs/xfs/xfs_dir_leaf.c              | 2224 +++++++++++++
 fs/xfs/xfs_dir_leaf.h              |  254 ++
 fs/xfs/xfs_dir_sf.h                |  188 ++
 fs/xfs/xfs_dmapi.h                 |  322 ++
 fs/xfs/xfs_dqblk.h                 |   99 +
 fs/xfs/xfs_dquot.c                 | 1660 ++++++++++
 fs/xfs/xfs_dquot.h                 |  212 ++
 fs/xfs/xfs_dquot_item.c            |  675 ++++
 fs/xfs/xfs_dquot_item.h            |  104 +
 fs/xfs/xfs_error.c                 |  236 ++
 fs/xfs/xfs_error.h                 |  156 +
 fs/xfs/xfs_extfree_item.c          |  655 ++++
 fs/xfs/xfs_extfree_item.h          |  123 +
 fs/xfs/xfs_fs.h                    |  510 +++
 fs/xfs/xfs_fsops.c                 |  598 ++++
 fs/xfs/xfs_fsops.h                 |   70 +
 fs/xfs/xfs_ialloc.c                | 1336 ++++++++
 fs/xfs/xfs_ialloc.h                |  181 ++
 fs/xfs/xfs_ialloc_btree.c          | 2104 ++++++++++++
 fs/xfs/xfs_ialloc_btree.h          |  318 ++
 fs/xfs/xfs_iget.c                  | 1040 ++++++
 fs/xfs/xfs_imap.h                  |   54 +
 fs/xfs/xfs_inode.c                 | 3626 +++++++++++++++++++++
 fs/xfs/xfs_inode.h                 |  582 ++++
 fs/xfs/xfs_inode_item.c            | 1024 ++++++
 fs/xfs/xfs_inode_item.h            |  193 ++
 fs/xfs/xfs_inum.h                  |  173 +
 fs/xfs/xfs_iocore.c                |   86 +
 fs/xfs/xfs_itable.c                |  770 +++++
 fs/xfs/xfs_itable.h                |  102 +
 fs/xfs/xfs_log.c                   | 3628 +++++++++++++++++++++
 fs/xfs/xfs_log.h                   |  191 ++
 fs/xfs/xfs_log_priv.h              |  562 ++++
 fs/xfs/xfs_log_recover.c           | 3702 +++++++++++++++++++++
 fs/xfs/xfs_log_recover.h           |   81 +
 fs/xfs/xfs_mac.c                   |   72 +
 fs/xfs/xfs_mac.h                   |  120 +
 fs/xfs/xfs_macros.c                | 2228 +++++++++++++
 fs/xfs/xfs_macros.h                |  105 +
 fs/xfs/xfs_mount.c                 | 1737 ++++++++++
 fs/xfs/xfs_mount.h                 |  453 +++
 fs/xfs/xfs_qm.c                    | 2899 +++++++++++++++++
 fs/xfs/xfs_qm.h                    |  219 ++
 fs/xfs/xfs_qm_syscalls.c           | 1465 +++++++++
 fs/xfs/xfs_quota.h                 |  351 ++
 fs/xfs/xfs_quota_priv.h            |  192 ++
 fs/xfs/xfs_rename.c                |  679 ++++
 fs/xfs/xfs_rtalloc.c               | 2438 ++++++++++++++
 fs/xfs/xfs_rtalloc.h               |  185 ++
 fs/xfs/xfs_rw.c                    |  753 +++++
 fs/xfs/xfs_rw.h                    |  219 ++
 fs/xfs/xfs_sb.h                    |  503 +++
 fs/xfs/xfs_trans.c                 | 1268 ++++++++
 fs/xfs/xfs_trans.h                 | 1034 ++++++
 fs/xfs/xfs_trans_ail.c             |  588 ++++
 fs/xfs/xfs_trans_buf.c             | 1085 +++++++
 fs/xfs/xfs_trans_dquot.c           |  852 +++++
 fs/xfs/xfs_trans_extfree.c         |  150 +
 fs/xfs/xfs_trans_inode.c           |  433 +++
 fs/xfs/xfs_trans_item.c            |  557 ++++
 fs/xfs/xfs_trans_priv.h            |   73 +
 fs/xfs/xfs_trans_space.h           |  105 +
 fs/xfs/xfs_types.h                 |  330 ++
 fs/xfs/xfs_utils.c                 |  519 +++
 fs/xfs/xfs_utils.h                 |  105 +
 fs/xfs/xfs_vfsops.c                | 1669 ++++++++++
 fs/xfs/xfs_vnodeops.c              | 5265 ++++++++++++++++++++++++++++++
 include/linux/sched.h              |    1 +
 include/linux/sysctl.h             |    2 +
 kernel/ksyms.c                     |    1 +
 178 files changed, 113360 insertions(+)
 create mode 100644 Documentation/filesystems/xfs.txt
 create mode 100644 fs/xfs/Makefile
 create mode 100644 fs/xfs/linux/xfs_aops.c
 create mode 100644 fs/xfs/linux/xfs_behavior.c
 create mode 100644 fs/xfs/linux/xfs_behavior.h
 create mode 100644 fs/xfs/linux/xfs_cred.h
 create mode 100644 fs/xfs/linux/xfs_file.c
 create mode 100644 fs/xfs/linux/xfs_fs_subr.c
 create mode 100644 fs/xfs/linux/xfs_fs_subr.h
 create mode 100644 fs/xfs/linux/xfs_globals.c
 create mode 100644 fs/xfs/linux/xfs_globals.h
 create mode 100644 fs/xfs/linux/xfs_ioctl.c
 create mode 100644 fs/xfs/linux/xfs_iops.c
 create mode 100644 fs/xfs/linux/xfs_iops.h
 create mode 100644 fs/xfs/linux/xfs_linux.h
 create mode 100644 fs/xfs/linux/xfs_lrw.c
 create mode 100644 fs/xfs/linux/xfs_lrw.h
 create mode 100644 fs/xfs/linux/xfs_stats.c
 create mode 100644 fs/xfs/linux/xfs_stats.h
 create mode 100644 fs/xfs/linux/xfs_super.c
 create mode 100644 fs/xfs/linux/xfs_super.h
 create mode 100644 fs/xfs/linux/xfs_sysctl.c
 create mode 100644 fs/xfs/linux/xfs_sysctl.h
 create mode 100644 fs/xfs/linux/xfs_version.h
 create mode 100644 fs/xfs/linux/xfs_vfs.h
 create mode 100644 fs/xfs/linux/xfs_vnode.c
 create mode 100644 fs/xfs/linux/xfs_vnode.h
 create mode 100644 fs/xfs/pagebuf/page_buf.c
 create mode 100644 fs/xfs/pagebuf/page_buf.h
 create mode 100644 fs/xfs/pagebuf/page_buf_internal.h
 create mode 100644 fs/xfs/pagebuf/page_buf_locking.c
 create mode 100644 fs/xfs/pagebuf/page_buf_trace.h
 create mode 100644 fs/xfs/support/atomic.h
 create mode 100644 fs/xfs/support/debug.c
 create mode 100644 fs/xfs/support/debug.h
 create mode 100644 fs/xfs/support/kmem.c
 create mode 100644 fs/xfs/support/kmem.h
 create mode 100644 fs/xfs/support/ktrace.c
 create mode 100644 fs/xfs/support/ktrace.h
 create mode 100644 fs/xfs/support/move.c
 create mode 100644 fs/xfs/support/move.h
 create mode 100644 fs/xfs/support/mrlock.c
 create mode 100644 fs/xfs/support/mrlock.h
 create mode 100644 fs/xfs/support/mutex.h
 create mode 100644 fs/xfs/support/qsort.c
 create mode 100644 fs/xfs/support/qsort.h
 create mode 100644 fs/xfs/support/sema.h
 create mode 100644 fs/xfs/support/spin.h
 create mode 100644 fs/xfs/support/sv.h
 create mode 100644 fs/xfs/support/time.h
 create mode 100644 fs/xfs/support/uuid.c
 create mode 100644 fs/xfs/support/uuid.h
 create mode 100644 fs/xfs/xfs.h
 create mode 100644 fs/xfs/xfs_acl.c
 create mode 100644 fs/xfs/xfs_acl.h
 create mode 100644 fs/xfs/xfs_ag.h
 create mode 100644 fs/xfs/xfs_alloc.c
 create mode 100644 fs/xfs/xfs_alloc.h
 create mode 100644 fs/xfs/xfs_alloc_btree.c
 create mode 100644 fs/xfs/xfs_alloc_btree.h
 create mode 100644 fs/xfs/xfs_arch.h
 create mode 100644 fs/xfs/xfs_attr.c
 create mode 100644 fs/xfs/xfs_attr.h
 create mode 100644 fs/xfs/xfs_attr_fetch.c
 create mode 100644 fs/xfs/xfs_attr_leaf.c
 create mode 100644 fs/xfs/xfs_attr_leaf.h
 create mode 100644 fs/xfs/xfs_attr_sf.h
 create mode 100644 fs/xfs/xfs_bit.c
 create mode 100644 fs/xfs/xfs_bit.h
 create mode 100644 fs/xfs/xfs_bmap.c
 create mode 100644 fs/xfs/xfs_bmap.h
 create mode 100644 fs/xfs/xfs_bmap_btree.c
 create mode 100644 fs/xfs/xfs_bmap_btree.h
 create mode 100644 fs/xfs/xfs_btree.c
 create mode 100644 fs/xfs/xfs_btree.h
 create mode 100644 fs/xfs/xfs_buf.h
 create mode 100644 fs/xfs/xfs_buf_item.c
 create mode 100644 fs/xfs/xfs_buf_item.h
 create mode 100644 fs/xfs/xfs_cap.c
 create mode 100644 fs/xfs/xfs_cap.h
 create mode 100644 fs/xfs/xfs_clnt.h
 create mode 100644 fs/xfs/xfs_da_btree.c
 create mode 100644 fs/xfs/xfs_da_btree.h
 create mode 100644 fs/xfs/xfs_dfrag.c
 create mode 100644 fs/xfs/xfs_dfrag.h
 create mode 100644 fs/xfs/xfs_dinode.h
 create mode 100644 fs/xfs/xfs_dir.c
 create mode 100644 fs/xfs/xfs_dir.h
 create mode 100644 fs/xfs/xfs_dir2.c
 create mode 100644 fs/xfs/xfs_dir2.h
 create mode 100644 fs/xfs/xfs_dir2_block.c
 create mode 100644 fs/xfs/xfs_dir2_block.h
 create mode 100644 fs/xfs/xfs_dir2_data.c
 create mode 100644 fs/xfs/xfs_dir2_data.h
 create mode 100644 fs/xfs/xfs_dir2_leaf.c
 create mode 100644 fs/xfs/xfs_dir2_leaf.h
 create mode 100644 fs/xfs/xfs_dir2_node.c
 create mode 100644 fs/xfs/xfs_dir2_node.h
 create mode 100644 fs/xfs/xfs_dir2_sf.c
 create mode 100644 fs/xfs/xfs_dir2_sf.h
 create mode 100644 fs/xfs/xfs_dir2_trace.c
 create mode 100644 fs/xfs/xfs_dir2_trace.h
 create mode 100644 fs/xfs/xfs_dir_leaf.c
 create mode 100644 fs/xfs/xfs_dir_leaf.h
 create mode 100644 fs/xfs/xfs_dir_sf.h
 create mode 100644 fs/xfs/xfs_dmapi.h
 create mode 100644 fs/xfs/xfs_dqblk.h
 create mode 100644 fs/xfs/xfs_dquot.c
 create mode 100644 fs/xfs/xfs_dquot.h
 create mode 100644 fs/xfs/xfs_dquot_item.c
 create mode 100644 fs/xfs/xfs_dquot_item.h
 create mode 100644 fs/xfs/xfs_error.c
 create mode 100644 fs/xfs/xfs_error.h
 create mode 100644 fs/xfs/xfs_extfree_item.c
 create mode 100644 fs/xfs/xfs_extfree_item.h
 create mode 100644 fs/xfs/xfs_fs.h
 create mode 100644 fs/xfs/xfs_fsops.c
 create mode 100644 fs/xfs/xfs_fsops.h
 create mode 100644 fs/xfs/xfs_ialloc.c
 create mode 100644 fs/xfs/xfs_ialloc.h
 create mode 100644 fs/xfs/xfs_ialloc_btree.c
 create mode 100644 fs/xfs/xfs_ialloc_btree.h
 create mode 100644 fs/xfs/xfs_iget.c
 create mode 100644 fs/xfs/xfs_imap.h
 create mode 100644 fs/xfs/xfs_inode.c
 create mode 100644 fs/xfs/xfs_inode.h
 create mode 100644 fs/xfs/xfs_inode_item.c
 create mode 100644 fs/xfs/xfs_inode_item.h
 create mode 100644 fs/xfs/xfs_inum.h
 create mode 100644 fs/xfs/xfs_iocore.c
 create mode 100644 fs/xfs/xfs_itable.c
 create mode 100644 fs/xfs/xfs_itable.h
 create mode 100644 fs/xfs/xfs_log.c
 create mode 100644 fs/xfs/xfs_log.h
 create mode 100644 fs/xfs/xfs_log_priv.h
 create mode 100644 fs/xfs/xfs_log_recover.c
 create mode 100644 fs/xfs/xfs_log_recover.h
 create mode 100644 fs/xfs/xfs_mac.c
 create mode 100644 fs/xfs/xfs_mac.h
 create mode 100644 fs/xfs/xfs_macros.c
 create mode 100644 fs/xfs/xfs_macros.h
 create mode 100644 fs/xfs/xfs_mount.c
 create mode 100644 fs/xfs/xfs_mount.h
 create mode 100644 fs/xfs/xfs_qm.c
 create mode 100644 fs/xfs/xfs_qm.h
 create mode 100644 fs/xfs/xfs_qm_syscalls.c
 create mode 100644 fs/xfs/xfs_quota.h
 create mode 100644 fs/xfs/xfs_quota_priv.h
 create mode 100644 fs/xfs/xfs_rename.c
 create mode 100644 fs/xfs/xfs_rtalloc.c
 create mode 100644 fs/xfs/xfs_rtalloc.h
 create mode 100644 fs/xfs/xfs_rw.c
 create mode 100644 fs/xfs/xfs_rw.h
 create mode 100644 fs/xfs/xfs_sb.h
 create mode 100644 fs/xfs/xfs_trans.c
 create mode 100644 fs/xfs/xfs_trans.h
 create mode 100644 fs/xfs/xfs_trans_ail.c
 create mode 100644 fs/xfs/xfs_trans_buf.c
 create mode 100644 fs/xfs/xfs_trans_dquot.c
 create mode 100644 fs/xfs/xfs_trans_extfree.c
 create mode 100644 fs/xfs/xfs_trans_inode.c
 create mode 100644 fs/xfs/xfs_trans_item.c
 create mode 100644 fs/xfs/xfs_trans_priv.h
 create mode 100644 fs/xfs/xfs_trans_space.h
 create mode 100644 fs/xfs/xfs_types.h
 create mode 100644 fs/xfs/xfs_utils.c
 create mode 100644 fs/xfs/xfs_utils.h
 create mode 100644 fs/xfs/xfs_vfsops.c
 create mode 100644 fs/xfs/xfs_vnodeops.c

(limited to 'include/linux')

diff --git a/Documentation/Changes b/Documentation/Changes
index 005c77815704..378c5cc7675c 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -56,6 +56,7 @@ o  modutils               2.4.2                   # insmod -V
 o  e2fsprogs              1.25                    # tune2fs
 o  jfsutils               1.0.14                  # fsck.jfs -V
 o  reiserfsprogs          3.x.1b                  # reiserfsck 2>&1|grep reiserfsprogs
+o  xfsprogs               2.1.0                   # xfs_db -V
 o  pcmcia-cs              3.1.21                  # cardmgr -V
 o  PPP                    2.4.0                   # pppd --version
 o  isdn4k-utils           3.1pre1                 # isdnctrl 2>&1|grep version
@@ -182,6 +183,17 @@ The reiserfsprogs package should be used for reiserfs-3.6.x
 versions of mkreiserfs, resize_reiserfs, debugreiserfs and
 reiserfsck. These utils work on both i386 and alpha platforms.
 
+Xfsprogs
+--------
+
+The latest version of xfsprogs contains mkfs.xfs, xfs_db, and the
+xfs_repair utilities, among others, for the XFS filesystem.  It is
+architecture independent and any version from 2.0.0 onward should
+work correctly with this version of the XFS kernel code.  For the new
+(v2) log format that has better support for stripe-size aligning on
+LVM and MD devices at least xfsprogs 2.1.0 is needed.
+
+
 Pcmcia-cs
 ---------
 
@@ -316,6 +328,10 @@ Reiserfsprogs
 -------------
 o  <ftp://ftp.namesys.com/pub/reiserfsprogs/reiserfsprogs-3.x.1b.tar.gz>
 
+Xfsprogs
+--------
+o  <ftp://oss.sgi.com/projects/xfs/download/cmd_tars/xfsprogs-2.1.0.src.tar.gz>
+
 LVM toolset
 -----------
 o  <http://www.sistina.com/lvm/>
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index f3c4cf5d2464..fd912db9bbc8 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -46,3 +46,5 @@ vfat.txt
 	- info on using the VFAT filesystem used in Windows NT and Windows 95
 vfs.txt
 	- Overview of the Virtual File System
+xfs.txt
+	- info and mount options for the XFS filesystem.
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
new file mode 100644
index 000000000000..166ef3d78898
--- /dev/null
+++ b/Documentation/filesystems/xfs.txt
@@ -0,0 +1,118 @@
+
+The SGI XFS Filesystem
+======================
+
+XFS is a high performance journaling filesystem which originated
+on the SGI IRIX platform.  It is completely multi-threaded, can
+support large files and large filesystems, extended attributes,
+variable block sizes, is extent based, and makes extensive use of
+Btrees (directories, extents, free space) to aid both performance
+and scalability.
+
+Refer to the documentation at http://oss.sgi.com/projects/xfs/
+for further details.  This implementation is on-disk compatible
+with the IRIX version of XFS.
+
+
+Options
+=======
+
+When mounting an XFS filesystem, the following options are accepted.
+
+  biosize=size
+	Sets the preferred buffered I/O size (default size is 64K).
+	"size" must be expressed as the logarithm (base2) of the
+	desired I/O size.
+	Valid values for this option are 14 through 16, inclusive
+	(i.e. 16K, 32K, and 64K bytes).  On machines with a 4K
+	pagesize, 13 (8K bytes) is also a valid size.
+	The preferred buffered I/O size can also be altered on an
+	individual file basis using the ioctl(2) system call.
+
+  dmapi/xdsm
+	Enable the DMAPI (Data Management API) event callouts.
+
+  irixsgid
+	Do not inherit the ISGID bit on subdirectories of ISGID
+	directories, if the process creating the subdirectory
+	is not a member of the parent directory group ID.
+	This matches IRIX behavior.
+
+  logbufs=value
+	Set the number of in-memory log buffers.  Valid numbers range
+	from 2-8 inclusive.
+	The default value is 8 buffers for filesystems with a
+	blocksize of 64K, 4 buffers for filesystems with a blocksize
+	of 32K, 3 buffers for filesystems with a blocksize of 16K
+	and 2 buffers for all other configurations.  Increasing the
+	number of buffers may increase performance on some workloads
+	at the cost of the memory used for the additional log buffers
+	and their associated control structures.
+
+  logbsize=value
+	Set the size of each in-memory log buffer.
+	Size may be specified in bytes, or in kilobytes with a "k" suffix.
+	Valid sizes for version 1 and version 2 logs are 16384 (16k) and 
+	32768 (32k).  Valid sizes for version 2 logs also include 
+	65536 (64k), 131072 (128k) and 262144 (256k).
+	The default value for machines with more than 32MB of memory
+	is 32768, machines with less memory use 16384 by default.
+
+  logdev=device and rtdev=device
+	Use an external log (metadata journal) and/or real-time device.
+	An XFS filesystem has up to three parts: a data section, a log
+	section, and a real-time section.  The real-time section is
+	optional, and the log section can be separate from the data
+	section or contained within it.
+
+  noalign
+	Data allocations will not be aligned at stripe unit boundaries.
+
+  noatime
+	Access timestamps are not updated when a file is read.
+
+  norecovery
+	The filesystem will be mounted without running log recovery.
+	If the filesystem was not cleanly unmounted, it is likely to
+	be inconsistent when mounted in "norecovery" mode.
+	Some files or directories may not be accessible because of this.
+	Filesystems mounted "norecovery" must be mounted read-only or
+	the mount will fail.
+
+  osyncisosync
+	Make O_SYNC writes implement true O_SYNC.  WITHOUT this option,
+	Linux XFS behaves as if an "osyncisdsync" option is used,
+	which will make writes to files opened with the O_SYNC flag set
+	behave as if the O_DSYNC flag had been used instead.
+	This can result in better performance without compromising
+	data safety.
+	However if this option is not in effect, timestamp updates from
+	O_SYNC writes can be lost if the system crashes.
+	If timestamp updates are critical, use the osyncisosync option.
+
+  quota/usrquota/uqnoenforce
+	User disk quota accounting enabled, and limits (optionally)
+	enforced.
+
+  grpquota/gqnoenforce
+	Group disk quota accounting enabled and limits (optionally)
+	enforced.
+
+  sunit=value and swidth=value
+	Used to specify the stripe unit and width for a RAID device or
+	a stripe volume.  "value" must be specified in 512-byte block
+	units.
+	If this option is not specified and the filesystem was made on
+	a stripe volume or the stripe width or unit were specified for
+	the RAID device at mkfs time, then the mount system call will
+	restore the value from the superblock.  For filesystems that
+	are made directly on RAID devices, these options can be used
+	to override the information in the superblock if the underlying
+	disk layout changes after the filesystem has been created.
+	The "swidth" option is required if the "sunit" option has been
+	specified, and must be a multiple of the "sunit" value.
+
+  nouuid
+        Don't check for double mounted file systems using the file system uuid.
+        This is useful to mount LVM snapshot volumes.
+
diff --git a/MAINTAINERS b/MAINTAINERS
index 54a0866c7e6e..ef0116d63dcb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1912,6 +1912,14 @@ M:	eis@baty.hanse.de
 L:	linux-x25@vger.kernel.org
 S:	Maintained
 
+XFS FILESYSTEM
+P:	Silicon Graphics Inc
+M:	owner-xfs@oss.sgi.com
+M:	lord@sgi.com
+L:	linux-xfs@oss.sgi.com
+W:	http://oss.sgi.com/projects/xfs
+S:	Supported
+
 X86 3-LEVEL PAGING (PAE) SUPPORT
 P:	Ingo Molnar
 M:	mingo@redhat.com
diff --git a/fs/Config.help b/fs/Config.help
index ad1c8ede99cf..218c8baa5ce0 100644
--- a/fs/Config.help
+++ b/fs/Config.help
@@ -1036,3 +1036,50 @@ CONFIG_NCPFS_NLS
 
   To select codepages and I/O charsets use ncpfs-2.2.0.13 or newer.
 
+CONFIG_XFS_FS
+  XFS is a high performance journaling filesystem which originated
+  on the SGI IRIX platform.  It is completely multi-threaded, can
+  support large files and large filesystems, extended attributes,
+  variable block sizes, is extent based, and makes extensive use of
+  Btrees (directories, extents, free space) to aid both performance
+  and scalability.
+
+  Refer to the documentation at <http://oss.sgi.com/projects/xfs/>
+  for complete details.  This implementation is on-disk compatible
+  with the IRIX version of XFS.
+
+  If you want to compile this file system as a module ( = code which
+  can be inserted in and removed from the running kernel whenever you
+  want), say M here and read <file:Documentation/modules.txt>. The
+  module will be called xfs.o.  Be aware, however, that if the file
+  system of your root partition is compiled as a module, you'll need
+  to use an initial ramdisk (initrd) to boot.
+
+CONFIG_XFS_QUOTA
+  If you say Y here, you will be able to set limits for disk usage on
+  a per user and/or a per group basis under XFS.  XFS considers quota
+  information as filesystem metadata and uses journaling to provide a
+  higher level guarantee of consistency.  The on-disk data format for
+  quota is also compatible with the IRIX version of XFS, allowing a
+  filesystem to be migrated between Linux and IRIX without any need
+  for conversion.
+
+  If unsure, say N.  More comprehensive documentation can be found in
+  README.quota in the xfsprogs package.  XFS quota can be used either
+  with or without the generic quota support enabled (CONFIG_QUOTA) -
+  they are completely independent subsystems.
+
+CONFIG_XFS_RT
+  If you say Y here you will be able to mount and use XFS filesystems
+  which contain a realtime subvolume. The realtime subvolume is a
+  separate area of disk space where only file data is stored. The
+  realtime subvolume is designed to provide very deterministic
+  data rates suitable for media streaming applications.
+
+  See the xfs man page in section 5 for a bit more information.
+
+  This feature is unsupported at this time, is not yet fully
+  functional, and may cause serious problems.
+
+  If unsure, say N.
+
diff --git a/fs/Config.in b/fs/Config.in
index 85e171bfe1f7..13f291114074 100644
--- a/fs/Config.in
+++ b/fs/Config.in
@@ -101,6 +101,13 @@ dep_mbool '  UDF write support (DANGEROUS)' CONFIG_UDF_RW $CONFIG_UDF_FS $CONFIG
 tristate 'UFS file system support (read only)' CONFIG_UFS_FS
 dep_mbool '  UFS file system write support (DANGEROUS)' CONFIG_UFS_FS_WRITE $CONFIG_UFS_FS $CONFIG_EXPERIMENTAL
 
+tristate 'XFS filesystem support' CONFIG_XFS_FS
+dep_mbool    '  Realtime support (EXPERIMENTAL)' CONFIG_XFS_RT $CONFIG_XFS_FS $CONFIG_EXPERIMENTAL
+dep_mbool    '  Quota support' CONFIG_XFS_QUOTA $CONFIG_XFS_FS
+if [ "$CONFIG_XFS_QUOTA" = "y" ]; then
+   define_bool CONFIG_QUOTACTL y
+fi
+
 if [ "$CONFIG_NET" = "y" ]; then
 
    mainmenu_option next_comment
diff --git a/fs/Makefile b/fs/Makefile
index 1a2f9a2dc9e6..eaccee26c5b3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -84,5 +84,6 @@ obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_JFS_FS)		+= jfs/
+obj-$(CONFIG_XFS_FS)		+= xfs/
 
 include $(TOPDIR)/Rules.make
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
new file mode 100644
index 000000000000..e44e49f4d336
--- /dev/null
+++ b/fs/xfs/Makefile
@@ -0,0 +1,160 @@
+#
+# Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.	Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+# Makefile for XFS on Linux.
+#
+
+
+# we also have source in the subdirectories..
+vpath %.c =  . linux pagebuf support
+
+
+# This needs -I. because everything does #include <xfs.h> instead of "xfs.h".
+# The code is wrong, local files should be included using "xfs.h", not <xfs.h>
+# but I am not going to change every file at the moment.
+EXTRA_CFLAGS +=	 -I. -funsigned-char
+
+ifeq ($(CONFIG_XFS_DEBUG),y)
+	EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG -DXFSDEBUG
+endif
+ifeq ($(CONFIG_PAGEBUF_DEBUG),y)
+	EXTRA_CFLAGS += -DPAGEBUF_TRACE
+endif
+
+export-objs			:= page_buf.o ktrace.o xfs_globals.o
+
+obj-$(CONFIG_XFS_FS)		+= xfs.o
+
+
+xfs-obj-$(CONFIG_XFS_RT)	+= xfs_rtalloc.o
+
+xfs-obj-$(CONFIG_XFS_QUOTA)	+= xfs_dquot.o \
+				   xfs_dquot_item.o \
+				   xfs_trans_dquot.o \
+				   xfs_qm_syscalls.o \
+				   xfs_qm.o
+
+xfs-obj-$(CONFIG_FS_POSIX_ACL)	+= xfs_acl.o
+xfs-obj-$(CONFIG_FS_POSIX_CAP)	+= xfs_cap.o
+xfs-obj-$(CONFIG_FS_POSIX_MAC)	+= xfs_mac.o
+xfs-obj-$(CONFIG_PROC_FS)	+= xfs_stats.o
+xfs-obj-$(CONFIG_SYSCTL)	+= xfs_sysctl.o
+
+
+xfs-objs			+= $(xfs-obj-y) \
+				   xfs_alloc.o \
+				   xfs_alloc_btree.o \
+				   xfs_attr.o \
+				   xfs_attr_fetch.o \
+				   xfs_attr_leaf.o \
+				   xfs_bit.o \
+				   xfs_bmap.o \
+				   xfs_bmap_btree.o \
+				   xfs_btree.o \
+				   xfs_buf_item.o \
+				   xfs_da_btree.o \
+				   xfs_dir.o \
+				   xfs_dir2.o \
+				   xfs_dir2_block.o \
+				   xfs_dir2_data.o \
+				   xfs_dir2_leaf.o \
+				   xfs_dir2_node.o \
+				   xfs_dir2_sf.o \
+				   xfs_dir2_trace.o \
+				   xfs_dir_leaf.o \
+				   xfs_error.o \
+				   xfs_extfree_item.o \
+				   xfs_fsops.o \
+				   xfs_ialloc.o \
+				   xfs_ialloc_btree.o \
+				   xfs_iget.o \
+				   xfs_inode.o \
+				   xfs_inode_item.o \
+				   xfs_iocore.o \
+				   xfs_itable.o \
+				   xfs_dfrag.o \
+				   xfs_log.o \
+				   xfs_log_recover.o \
+				   xfs_macros.o \
+				   xfs_mount.o \
+				   xfs_rename.o \
+				   xfs_trans.o \
+				   xfs_trans_ail.o \
+				   xfs_trans_buf.o \
+				   xfs_trans_extfree.o \
+				   xfs_trans_inode.o \
+				   xfs_trans_item.o \
+				   xfs_utils.o \
+				   xfs_vfsops.o \
+				   xfs_vnodeops.o \
+				   xfs_rw.o
+
+# Objects in pagebuf/
+xfs-objs			+= page_buf.o \
+				   page_buf_locking.o
+
+# Objects in linux/
+xfs-objs			+= xfs_aops.o \
+				   xfs_behavior.o \
+				   xfs_file.o \
+				   xfs_fs_subr.o \
+				   xfs_globals.o \
+				   xfs_ioctl.o \
+				   xfs_iops.o \
+				   xfs_lrw.o \
+				   xfs_super.o \
+				   xfs_vnode.o
+
+# Objects in support/
+xfs-objs			+= debug.o \
+				   kmem.o \
+				   ktrace.o \
+				   move.o \
+				   mrlock.o \
+				   qsort.o \
+				   uuid.o
+
+# If both xfs and kdb modules are built in then xfsidbg is built in.  If xfs is
+# a module and kdb modules are being compiled then xfsidbg must be a module, to
+# follow xfs.  If xfs is built in then xfsidbg tracks the kdb module state.
+# This must come after the main xfs code so xfs initialises before xfsidbg.
+# KAO
+ifneq ($(CONFIG_KDB_MODULES),)
+  ifeq ($(CONFIG_XFS_FS),y)
+    obj-$(CONFIG_KDB_MODULES)	+= xfsidbg.o
+  else
+    obj-$(CONFIG_XFS_FS)	+= xfsidbg.o
+  endif
+endif
+
+CFLAGS_xfsidbg.o += -I $(TOPDIR)/arch/$(ARCH)/kdb
+
+include $(TOPDIR)/Rules.make
diff --git a/fs/xfs/linux/xfs_aops.c b/fs/xfs/linux/xfs_aops.c
new file mode 100644
index 000000000000..a9209ab7b82c
--- /dev/null
+++ b/fs/xfs/linux/xfs_aops.c
@@ -0,0 +1,949 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/mpage.h>
+#include <linux/iobuf.h>
+
+
+STATIC int linvfs_pb_bmap(struct inode *, loff_t, ssize_t,
+			  struct page_buf_bmap_s *, int);
+STATIC int delalloc_convert(struct inode *, struct page *, int, int);
+
+/*
+ * match_offset_to_mapping
+ * Finds the corresponding mapping in block @map array of the
+ * given @offset within a @page.
+ */
+STATIC page_buf_bmap_t *
+match_offset_to_mapping(
+	struct page		*page,
+	page_buf_bmap_t		*map,
+	unsigned long		offset)
+{
+	loff_t			full_offset;	/* offset from start of file */
+
+	ASSERT(offset < PAGE_CACHE_SIZE);
+
+	full_offset = page->index;		/* NB: using 64bit number */
+	full_offset <<= PAGE_CACHE_SHIFT;	/* offset from file start */
+	full_offset += offset;			/* offset from page start */
+
+	if (full_offset < map->pbm_offset)
+		return NULL;
+	if (map->pbm_offset + map->pbm_bsize > full_offset)
+		return map;
+	return NULL;
+}
+
+STATIC void
+map_buffer_at_offset(
+	struct page		*page,
+	struct buffer_head	*bh,
+	unsigned long		offset,
+	int			block_bits,
+	page_buf_bmap_t		*mp)
+{
+	page_buf_daddr_t	bn;
+	loff_t			delta;
+	int			sector_shift;
+
+	ASSERT(!(mp->pbm_flags & PBMF_HOLE));
+	ASSERT(!(mp->pbm_flags & PBMF_DELAY));
+	ASSERT(!(mp->pbm_flags & PBMF_UNWRITTEN));
+	ASSERT(mp->pbm_bn != PAGE_BUF_DADDR_NULL);
+
+	delta = page->index;
+	delta <<= PAGE_CACHE_SHIFT;
+	delta += offset;
+	delta -= mp->pbm_offset;
+	delta >>= block_bits;
+
+	sector_shift = block_bits - 9;
+	bn = mp->pbm_bn >> sector_shift;
+	bn += delta;
+	ASSERT((bn << sector_shift) >= mp->pbm_bn);
+
+	lock_buffer(bh);
+	bh->b_blocknr = bn;
+	bh->b_bdev = mp->pbm_target->pbr_bdev;
+	set_buffer_mapped(bh);
+	clear_buffer_delay(bh);
+}
+
+/*
+ * Convert delalloc space to real space, do not flush the
+ * data out to disk, that will be done by the caller.
+ */
+STATIC int
+release_page(
+	struct page		*page)
+{
+	struct inode		*inode = (struct inode*)page->mapping->host;
+	unsigned long		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int			ret;
+
+	/* Are we off the end of the file ? */
+	if (page->index >= end_index) {
+		unsigned offset = inode->i_size & (PAGE_CACHE_SIZE-1);
+		if ((page->index >= end_index+1) || !offset) {
+			ret =  -EIO;
+			goto out;
+		}
+	}
+
+	ret = delalloc_convert(inode, page, 0, 0);
+
+out:
+	if (ret < 0) {
+		block_invalidatepage(page, 0);
+		ClearPageUptodate(page);
+
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Convert delalloc or unmapped space to real space and flush out
+ * to disk.
+ */
+STATIC int
+write_full_page(
+	struct page		*page,
+	int			delalloc)
+{
+	struct inode		*inode = (struct inode*)page->mapping->host;
+	unsigned long		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int			ret;
+
+	/* Are we off the end of the file ? */
+	if (page->index >= end_index) {
+		unsigned offset = inode->i_size & (PAGE_CACHE_SIZE-1);
+		if ((page->index >= end_index+1) || !offset) {
+			ret =  -EIO;
+			goto out;
+		}
+	}
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+	}
+
+	ret = delalloc_convert(inode, page, 1, delalloc == 0);
+
+out:
+	if (ret < 0) {
+		/*
+		 * If it's delalloc and we have nowhere to put it,
+		 * throw it away.
+		 */
+		if (delalloc)
+			block_invalidatepage(page, 0);
+		ClearPageUptodate(page);
+		unlock_page(page);
+	}
+
+	return ret;
+}
+
+/*
+ * Look for a page at index which is unlocked and not mapped
+ * yet - clustering for mmap write case.
+ */
+STATIC unsigned int
+probe_unmapped_page(
+	struct address_space	*mapping,
+	unsigned long		index,
+	unsigned int		pg_offset)
+{
+	struct page		*page;
+	int			ret = 0;
+
+	page = find_get_page(mapping, index);
+	if (!page)
+		return 0;
+	if (TestSetPageLocked(page)) {
+		page_cache_release(page);
+		return 0;
+	}
+	if (page->mapping && PageDirty(page)) {
+		if (!page_has_buffers(page)) {
+			ret = PAGE_CACHE_SIZE;
+		} else {
+			struct buffer_head	*bh, *head;
+			bh = head = page_buffers(page);
+			do {
+				if (buffer_mapped(bh) || !buffer_uptodate(bh)) {
+					break;
+				}
+				ret += bh->b_size;
+				if (ret >= pg_offset)
+					break;
+			} while ((bh = bh->b_this_page) != head);
+		}
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+STATIC unsigned int
+probe_unmapped_cluster(
+	struct inode		*inode,
+	struct page		*startpage,
+	struct buffer_head	*bh,
+	struct buffer_head	*head)
+{
+	unsigned long		tindex, tlast;
+	unsigned int		len, total = 0;
+	struct address_space	*mapping = inode->i_mapping;
+
+	/* First sum forwards in this page */
+	do {
+		if (buffer_mapped(bh))
+			break;
+		total += bh->b_size;
+	} while ((bh = bh->b_this_page) != head);
+
+	/* if we reached the end of the page, sum forwards in
+	 * following pages.
+	 */
+	if (bh == head) {
+		tlast = inode->i_size >> PAGE_CACHE_SHIFT;
+		for (tindex = startpage->index + 1; tindex < tlast; tindex++) {
+			len = probe_unmapped_page(mapping, tindex,
+							PAGE_CACHE_SIZE);
+			if (!len)
+				break;
+			total += len;
+		}
+		if ((tindex == tlast) && (inode->i_size & ~PAGE_CACHE_MASK)) {
+			len = probe_unmapped_page(mapping, tindex,
+					inode->i_size & ~PAGE_CACHE_MASK);
+			total += len;
+		}
+	}
+	return total;
+}
+
+/*
+ * Probe for a given page (index) in the inode & test if it is delayed.
+ * Returns page locked and with an extra reference count.
+ */
+STATIC struct page *
+probe_page(
+	struct inode		*inode,
+	unsigned long		index)
+{
+	struct page		*page;
+
+	page = find_get_page(inode->i_mapping, index);
+	if (!page)
+		return NULL;
+	if (TestSetPageLocked(page)) {
+		page_cache_release(page);
+		return NULL;
+	}
+	if (page->mapping && page_has_buffers(page)) {
+		struct buffer_head	*bh, *head;
+
+		bh = head = page_buffers(page);
+		do {
+			if (buffer_delay(bh))
+				return page;
+		} while ((bh = bh->b_this_page) != head);
+	}
+	unlock_page(page);
+	page_cache_release(page);
+	return NULL;
+}
+
+STATIC void
+submit_page(
+	struct page		*page,
+	struct buffer_head	*bh_arr[],
+	int			cnt)
+{
+	struct buffer_head	*bh;
+	int			i;
+
+	BUG_ON(PageWriteback(page));
+	SetPageWriteback(page);
+	unlock_page(page);
+
+	if (cnt) {
+		for (i = 0; i < cnt; i++) {
+			bh = bh_arr[i];
+			mark_buffer_async_write(bh);
+			set_buffer_uptodate(bh);
+			clear_buffer_dirty(bh);
+		}
+
+		for (i = 0; i < cnt; i++)
+			submit_bh(WRITE, bh_arr[i]);
+	} else
+		end_page_writeback(page);
+}
+
+STATIC int
+map_page(
+	struct inode		*inode,
+	struct page		*page,
+	page_buf_bmap_t		*maps,
+	struct buffer_head	*bh_arr[],
+	int			startio,
+	int			all_bh)
+{
+	struct buffer_head	*bh, *head;
+	page_buf_bmap_t		*mp = maps, *tmp;
+	unsigned long		end, offset, end_index;
+	int			i = 0, index = 0;
+	int			bbits = inode->i_blkbits;
+
+	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	if (page->index < end_index) {
+		end = PAGE_CACHE_SIZE;
+	} else {
+		end = inode->i_size & (PAGE_CACHE_SIZE-1);
+	}
+	bh = head = page_buffers(page);
+	do {
+		offset = i << bbits;
+		if (!buffer_uptodate(bh))
+			continue;
+		if (buffer_mapped(bh) && !buffer_delay(bh) && all_bh) {
+			if (startio && (offset < end)) {
+				lock_buffer(bh);
+				bh_arr[index++] = bh;
+			}
+			continue;
+		}
+		tmp = match_offset_to_mapping(page, mp, offset);
+		if (!tmp)
+			continue;
+		ASSERT(!(tmp->pbm_flags & PBMF_HOLE));
+		ASSERT(!(tmp->pbm_flags & PBMF_DELAY));
+		map_buffer_at_offset(page, bh, offset, bbits, tmp);
+		if (startio && (offset < end)) {
+			bh_arr[index++] = bh;
+		} else {
+			unlock_buffer(bh);
+		}
+	} while (i++, (bh = bh->b_this_page) != head);
+
+	return index;
+}
+
+/*
+ * Allocate & map buffers for page given the extent map. Write it out.
+ * except for the original page of a writepage, this is called on
+ * delalloc pages only, for the original page it is possible that
+ * the page has no mapping at all.
+ */
+STATIC void
+convert_page(
+	struct inode		*inode,
+	struct page		*page,
+	page_buf_bmap_t		*maps,
+	int			startio,
+	int			all_bh)
+{
+	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE];
+	int			cnt;
+
+	cnt = map_page(inode, page, maps, bh_arr, startio, all_bh);
+	if (startio) {
+		submit_page(page, bh_arr, cnt);
+	} else {
+		unlock_page(page);
+	}
+	page_cache_release(page);
+}
+
+/*
+ * Convert & write out a cluster of pages in the same extent as defined
+ * by mp and following the start page.
+ */
+STATIC void
+cluster_write(
+	struct inode		*inode,
+	unsigned long		tindex,
+	page_buf_bmap_t		*mp,
+	int			startio,
+	int			all_bh)
+{
+	unsigned long		tlast;
+	struct page		*page;
+
+	tlast = (mp->pbm_offset + mp->pbm_bsize) >> PAGE_CACHE_SHIFT;
+	for (; tindex < tlast; tindex++) {
+		if (!(page = probe_page(inode, tindex)))
+			break;
+		convert_page(inode, page, mp, startio, all_bh);
+	}
+}
+
+/*
+ * Calling this without allocate_space set means we are being asked to
+ * flush a dirty buffer head. When called with async_write set then we
+ * are coming from writepage. A writepage call with allocate_space set
+ * means we are being asked to write out all of the page which is before
+ * EOF and therefore need to allocate space for unmapped portions of the
+ * page.
+ */
+STATIC int
+delalloc_convert(
+	struct inode		*inode,		/* inode containing page */
+	struct page		*page,		/* page to convert - locked */
+	int			startio,	/* start io on the page */
+	int			allocate_space)
+{
+	struct buffer_head	*bh, *head;
+	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE];
+	page_buf_bmap_t		*mp, map;
+	int			i, cnt = 0;
+	int			len, err;
+	unsigned long		p_offset = 0;
+	loff_t			offset;
+	loff_t			end_offset;
+
+	offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	end_offset = offset + PAGE_CACHE_SIZE;
+	if (end_offset > inode->i_size)
+		end_offset = inode->i_size;
+
+	bh = head = page_buffers(page);
+	mp = NULL;
+
+	len = bh->b_size;
+	do {
+		if (!buffer_uptodate(bh) && !startio) {
+			goto next_bh;
+		}
+
+		if (mp) {
+			mp = match_offset_to_mapping(page, &map, p_offset);
+		}
+
+		if (buffer_delay(bh)) {
+			if (!mp) {
+				err = linvfs_pb_bmap(inode, offset, len, &map,
+						PBF_WRITE|PBF_FILE_ALLOCATE);
+				if (err)
+					goto error;
+				mp = match_offset_to_mapping(page, &map,
+								p_offset);
+			}
+			if (mp) {
+				map_buffer_at_offset(page, bh, p_offset,
+					inode->i_blkbits, mp);
+				if (startio) {
+					bh_arr[cnt++] = bh;
+				} else {
+					unlock_buffer(bh);
+				}
+			}
+		} else if (!buffer_mapped(bh) && allocate_space) {
+			int	size;
+
+			/* Getting here implies an unmapped buffer was found,
+			 * and we are in a path where we need to write the
+			 * whole page out.
+			 */
+			if (!mp) {
+				size = probe_unmapped_cluster(inode, page,
+								bh, head);
+				err = linvfs_pb_bmap(inode, offset, size, &map,
+						PBF_WRITE|PBF_DIRECT);
+				if (err)
+					goto error;
+				mp = match_offset_to_mapping(page, &map,
+								p_offset);
+			}
+			if (mp) {
+				map_buffer_at_offset(page, bh, p_offset,
+					inode->i_blkbits, mp);
+				if (startio) {
+					bh_arr[cnt++] = bh;
+				} else {
+					unlock_buffer(bh);
+				}
+			}
+		} else if (startio && buffer_mapped(bh)) {
+			if (buffer_dirty(bh) || allocate_space) {
+				lock_buffer(bh);
+				bh_arr[cnt++] = bh;
+			}
+		}
+
+next_bh:
+		offset += len;
+		p_offset += len;
+		bh = bh->b_this_page;
+	} while (offset < end_offset);
+
+	if (startio)
+		submit_page(page, bh_arr, cnt);
+
+	if (mp)
+		cluster_write(inode, page->index + 1, mp,
+				startio, allocate_space);
+
+	return 0;
+
+error:
+	for (i = 0; i < cnt; i++) {
+		unlock_buffer(bh_arr[i]);
+	}
+
+	return err;
+}
+
+STATIC int
+linvfs_get_block_core(
+	struct inode		*inode,
+	sector_t		iblock,
+	int			blocks,
+	struct buffer_head	*bh_result,
+	int			create,
+	int			direct,
+	page_buf_flags_t	flags)
+{
+	vnode_t			*vp = LINVFS_GET_VP(inode);
+	page_buf_bmap_t		pbmap;
+	int			retpbbm = 1;
+	int			error;
+	ssize_t			size;
+	loff_t			offset = (loff_t)iblock << inode->i_blkbits;
+
+	if (blocks) {
+		size = blocks << inode->i_blkbits;
+	} else {
+		/* If we are doing writes at the end of the file,
+		 * allocate in chunks
+		 */
+		if (create && (offset >= inode->i_size) && !(flags & PBF_SYNC))
+			size = 1 << XFS_WRITE_IO_LOG;
+		else
+			size = 1 << inode->i_blkbits;
+	}
+
+	VOP_BMAP(vp, offset, size,
+		create ? flags : PBF_READ, NULL,
+		(struct page_buf_bmap_s *)&pbmap, &retpbbm, error);
+	if (error)
+		return -error;
+
+	if (retpbbm == 0)
+		return 0;
+
+	if (pbmap.pbm_bn != PAGE_BUF_DADDR_NULL) {
+		page_buf_daddr_t	bn;
+		loff_t			delta;
+
+		delta = offset - pbmap.pbm_offset;
+		delta >>= inode->i_blkbits;
+
+		bn = pbmap.pbm_bn >> (inode->i_blkbits - 9);
+		bn += delta;
+
+		bh_result->b_blocknr = bn;
+		bh_result->b_bdev = pbmap.pbm_target->pbr_bdev;
+		set_buffer_mapped(bh_result);
+	}
+
+	/* If we previously allocated a block out beyond eof and
+	 * we are now coming back to use it then we will need to
+	 * flag it as new even if it has a disk address.
+	 */
+	if (create &&
+	    ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+	     (offset >= inode->i_size))) {
+		set_buffer_new(bh_result);
+	}
+
+	if (pbmap.pbm_flags & PBMF_DELAY) {
+		if (unlikely(direct))
+			BUG();
+		if (create) {
+			set_buffer_mapped(bh_result);
+			set_buffer_uptodate(bh_result);
+		}
+		bh_result->b_bdev = pbmap.pbm_target->pbr_bdev;
+		set_buffer_delay(bh_result);
+	}
+
+	if (blocks) {
+		size = (pbmap.pbm_bsize - pbmap.pbm_delta); 
+		bh_result->b_size = min(size, blocks << inode->i_blkbits);
+	}
+
+	return 0;
+}
+
+int
+linvfs_get_block(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return linvfs_get_block_core(inode, iblock, 0, bh_result,
+					create, 0, PBF_WRITE);
+}
+
+STATIC int
+linvfs_get_block_sync(
+	struct inode		*inode,
+	sector_t		iblock,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return linvfs_get_block_core(inode, iblock, 0, bh_result,
+					create, 0, PBF_SYNC|PBF_WRITE);
+}
+
+STATIC int
+linvfs_get_blocks_direct(
+	struct inode		*inode,
+	sector_t		iblock,
+	unsigned long		max_blocks,
+	struct buffer_head	*bh_result,
+	int			create)
+{
+	return linvfs_get_block_core(inode, iblock, max_blocks, bh_result,
+					create, 1, PBF_WRITE|PBF_DIRECT);
+}
+
+STATIC int
+linvfs_direct_IO(
+	int			rw,
+	struct inode		*inode,
+	char			*buf,
+	loff_t			offset,
+	size_t			count)
+{
+        return generic_direct_IO(rw, inode, buf, offset, count, 
+					linvfs_get_blocks_direct);
+}
+
+STATIC int
+linvfs_pb_bmap(
+	struct inode		*inode,
+	loff_t			offset,
+	ssize_t			count,
+	page_buf_bmap_t		*pbmapp,
+	int			flags)
+{
+	vnode_t			*vp = LINVFS_GET_VP(inode);
+	int			error, nmaps = 1;
+
+retry:
+	if (flags & PBF_FILE_ALLOCATE) {
+		VOP_STRATEGY(vp, offset, count, flags, NULL,
+				pbmapp, &nmaps, error);
+	} else {
+		VOP_BMAP(vp, offset, count, flags, NULL,
+				pbmapp, &nmaps, error);
+	}
+	if (flags & PBF_WRITE) {
+		if (unlikely((flags & PBF_DIRECT) && nmaps &&
+		    (pbmapp->pbm_flags & PBMF_DELAY))) {
+			flags = PBF_WRITE | PBF_FILE_ALLOCATE;
+			goto retry;
+		}
+		VMODIFY(vp);
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_bmap(
+	struct address_space	*mapping,
+	long			block)
+{
+	struct inode		*inode = (struct inode *)mapping->host;
+	vnode_t			*vp = LINVFS_GET_VP(inode);
+	int			error;
+
+	/* block	     - Linux disk blocks    512b */
+	/* bmap input offset - bytes		      1b */
+	/* bmap output bn    - XFS BBs		    512b */
+	/* bmap output delta - bytes		      1b */
+
+	vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address);
+
+	VOP_RWLOCK(vp, VRWLOCK_READ);
+	VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
+	VOP_RWUNLOCK(vp, VRWLOCK_READ);
+	return generic_block_bmap(mapping, block, linvfs_get_block);
+}
+
+STATIC int
+linvfs_read_full_page(
+	struct file		*unused,
+	struct page		*page)
+{
+	return block_read_full_page(page, linvfs_get_block);
+}
+
+STATIC int
+linvfs_readpages(
+	struct address_space	*mapping,
+	struct list_head	*pages,
+	unsigned		nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, linvfs_get_block);
+}
+
+
+STATIC int
+count_page_state(
+	struct page		*page,
+	int			*nr_delalloc,
+	int			*nr_unmapped)
+{
+	*nr_delalloc = *nr_unmapped = 0;
+
+	if (page_has_buffers(page)) {
+		struct buffer_head	*bh, *head;
+
+		bh = head = page_buffers(page);
+		do {
+			if (buffer_uptodate(bh) && !buffer_mapped(bh)) {
+				(*nr_unmapped)++;
+				continue;
+			}
+			if (!buffer_delay(bh))
+				continue;
+			(*nr_delalloc)++;
+		} while ((bh = bh->b_this_page) != head);
+		return 1;
+	}
+
+	return 0;
+}
+
+STATIC int
+linvfs_write_full_page(
+	struct page		*page)
+{
+	int			error;
+	int			need_trans;
+	int			nr_delalloc, nr_unmapped;
+
+	if (count_page_state(page, &nr_delalloc, &nr_unmapped)) {
+		need_trans = nr_delalloc + nr_unmapped;
+	} else {
+		need_trans = 1;
+	}
+
+	if ((current->flags & (PF_FSTRANS)) && need_trans)
+		goto out_fail;
+
+	error = write_full_page(page, nr_delalloc);
+
+	return error;
+
+out_fail:
+	set_page_dirty(page);
+	unlock_page(page);
+	return 0;
+}
+
+STATIC int
+linvfs_prepare_write(
+	struct file		*file,
+	struct page		*page,
+	unsigned int		from,
+	unsigned int		to)
+{
+	if (file && (file->f_flags & O_SYNC)) {
+		return block_prepare_write(page, from, to,
+						linvfs_get_block_sync);
+	} else {
+		return block_prepare_write(page, from, to,
+						linvfs_get_block);
+	}
+}
+
+#if 0
+/* Keeping this for now as an example of a better way of
+ * doing O_DIRECT for XFS - the generic path has more
+ * overhead than we want.
+ */
+
+/*
+ * Initiate I/O on a kiobuf of user memory
+ */
+STATIC int
+linvfs_direct_IO(
+	int			rw,
+	struct inode		*inode,
+	struct kiobuf		*iobuf,
+	unsigned long		blocknr,
+	int			blocksize)
+{
+	struct page		**maplist;
+	size_t			page_offset;
+	page_buf_t		*pb;
+	page_buf_bmap_t		map;
+	int			error = 0;
+	int			pb_flags, map_flags, pg_index = 0;
+	size_t			length, total;
+	loff_t			offset;
+	size_t			map_size, size;
+
+	total = length = iobuf->length;
+	offset = blocknr;
+	offset <<= inode->i_blkbits;
+
+	maplist = iobuf->maplist;
+	page_offset = iobuf->offset;
+
+	map_flags = (rw ? PBF_WRITE : PBF_READ) | PBF_DIRECT;
+	pb_flags = (rw ? PBF_WRITE : PBF_READ) | PBF_FORCEIO | _PBF_LOCKABLE;
+	while (length) {
+		error = linvfs_pb_bmap(inode, offset, length, &map, map_flags);
+		if (error)
+			break;
+
+		map_size = map.pbm_bsize - map.pbm_delta;
+		size = min(map_size, length);
+		if (map.pbm_flags & PBMF_HOLE) {
+			size_t	zero_len = size;
+
+			if (rw == WRITE)
+				break;
+
+			/* Need to zero it all */
+			while (zero_len) {
+				struct page	*page;
+				size_t		pg_len;
+
+				pg_len = min((size_t)
+						(PAGE_CACHE_SIZE - page_offset),
+						zero_len);
+
+				page = maplist[pg_index];
+
+				memset(kmap(page) + page_offset, 0, pg_len);
+				flush_dcache_page(page);
+				kunmap(page);
+
+				zero_len -= pg_len;
+				if ((pg_len + page_offset) == PAGE_CACHE_SIZE) {
+					pg_index++;
+					page_offset = 0;
+				} else {
+					page_offset = (page_offset + pg_len) &
+							~PAGE_CACHE_MASK;
+				}
+			}
+		} else {
+			int	pg_count;
+
+			pg_count = (size + page_offset + PAGE_CACHE_SIZE - 1)
+					>> PAGE_CACHE_SHIFT;
+			pb = pagebuf_lookup(map.pbm_target, inode, offset,
+							size, pb_flags);
+			/* Need to hook up pagebuf to kiobuf pages */
+			pb->pb_pages = &maplist[pg_index];
+			pb->pb_offset = page_offset;
+			pb->pb_page_count = pg_count;
+
+			pb->pb_bn = map.pbm_bn + (map.pbm_delta >> 9);
+			pagebuf_iostart(pb, pb_flags);
+			pb->pb_flags &= ~_PBF_LOCKABLE;
+			pagebuf_rele(pb);
+
+			page_offset = (page_offset + size) & ~PAGE_CACHE_MASK;
+			if (page_offset)
+				pg_count--;
+			pg_index += pg_count;
+		}
+
+		offset += size;
+		length -= size;
+	}
+
+	return error ? error : total - length;
+}
+#endif
+
+/*
+ * This gets a page into cleanable state - page locked on entry
+ * kept locked on exit. If the page is marked dirty we should
+ * not come this way.
+ */
+STATIC int
+linvfs_release_page(
+	struct page		*page,
+	int			gfp_mask)
+{
+	int			need_trans;
+	int			nr_delalloc, nr_unmapped;
+
+	if (count_page_state(page, &nr_delalloc, &nr_unmapped)) {
+		need_trans = nr_delalloc;
+	} else {
+		need_trans = 0;
+	}
+
+	if (need_trans == 0) {
+		return try_to_free_buffers(page);
+	}
+
+	if (gfp_mask & __GFP_FS) {
+		if (release_page(page) == 0)
+			return try_to_free_buffers(page);
+	}
+	return 0;
+}
+
+
+struct address_space_operations linvfs_aops = {
+	.readpage		= linvfs_read_full_page,
+	.readpages		= linvfs_readpages,
+	.writepage		= linvfs_write_full_page,
+	.sync_page		= block_sync_page,
+	.releasepage		= linvfs_release_page,
+	.prepare_write		= linvfs_prepare_write,
+	.commit_write		= generic_commit_write,
+	.bmap			= linvfs_bmap,
+	.direct_IO		= linvfs_direct_IO,
+};
diff --git a/fs/xfs/linux/xfs_behavior.c b/fs/xfs/linux/xfs_behavior.c
new file mode 100644
index 000000000000..32d65e2a414e
--- /dev/null
+++ b/fs/xfs/linux/xfs_behavior.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ *
+ */
+
+/*
+ * Source file used to associate/disassociate behaviors with virtualized
+ * objects.  See behavior.h for more information about behaviors, etc.
+ *
+ * The implementation is split between functions in this file and macros
+ * in behavior.h.
+ */
+#include <xfs.h>
+
+kmem_zone_t	*bhv_global_zone;
+
+/*
+ * Global initialization function called out of main.
+ */
+void
+bhv_global_init(void)
+{
+	/*
+	 * Initialize a behavior zone used by subsystems using behaviors
+	 * but without any private data.  In the UNIKERNEL case, this zone
+	 * is used only for behaviors that are not yet isolated to a single
+	 * cell.  The only such user is in pshm.c in which a dummy vnode is
+	 * obtained in support of vce avoidance logic.
+	 */
+	bhv_global_zone = kmem_zone_init(sizeof(bhv_desc_t), "bhv_global_zone");
+}
+
+/*
+ * Remove a behavior descriptor from a position in a behavior chain;
+ * the postition is guaranteed not to be the first position.
+ * Should only be called by the bhv_remove() macro.
+ *
+ * The act of modifying the chain is done atomically w.r.t. ops-in-progress
+ * (see comment at top of behavior.h for more info on synchronization).
+ */
+void
+bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
+{
+	bhv_desc_t	*curdesc, *prev;
+
+	ASSERT(bhp->bh_first != NULL);
+	ASSERT(bhp->bh_first->bd_next != NULL);
+
+	prev = bhp->bh_first;
+	for (curdesc = bhp->bh_first->bd_next;
+	     curdesc != NULL;
+	     curdesc = curdesc->bd_next) {
+
+		if (curdesc == bdp)
+			break;		/* found it */
+		prev = curdesc;
+	}
+
+	ASSERT(curdesc == bdp);
+	prev->bd_next = bdp->bd_next;	/* remove from after prev */
+					/* atomic wrt oip's */
+}
+
+/*
+ * Look for a specific ops vector on the specified behavior chain.
+ * Return the associated behavior descriptor.  Or NULL, if not found.
+ */
+bhv_desc_t *
+bhv_lookup(bhv_head_t *bhp, void *ops)
+{
+	bhv_desc_t	*curdesc;
+
+	for (curdesc = bhp->bh_first;
+	     curdesc != NULL;
+	     curdesc = curdesc->bd_next) {
+
+		if (curdesc->bd_ops == ops)
+			return curdesc;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look for a specific ops vector on the specified behavior chain.
+ * Return the associated behavior descriptor.  Or NULL, if not found.
+ *
+ * The caller has not read locked the behavior chain, so acquire the
+ * lock before traversing the chain.
+ */
+bhv_desc_t *
+bhv_lookup_unlocked(bhv_head_t *bhp, void *ops)
+{
+	bhv_desc_t	*bdp;
+
+	BHV_READ_LOCK(bhp);
+	bdp = bhv_lookup(bhp, ops);
+	BHV_READ_UNLOCK(bhp);
+
+	return bdp;
+}
+
+/*
+ * Return the base behavior in the chain, or NULL if the chain
+ * is empty.
+ *
+ * The caller has not read locked the behavior chain, so acquire the
+ * lock before traversing the chain.
+ */
+bhv_desc_t *
+bhv_base_unlocked(bhv_head_t *bhp)
+{
+	bhv_desc_t	*curdesc;
+
+	BHV_READ_LOCK(bhp);
+	for (curdesc = bhp->bh_first;
+	     curdesc != NULL;
+	     curdesc = curdesc->bd_next) {
+
+		if (curdesc->bd_next == NULL) {
+			BHV_READ_UNLOCK(bhp);
+			return curdesc;
+		}
+	}
+
+	BHV_READ_UNLOCK(bhp);
+	return NULL;
+}
+
+#define BHVMAGIC (void *)0xf00d
+
+/* ARGSUSED */
+void
+bhv_head_init(
+	bhv_head_t *bhp,
+	char *name)
+{
+	bhp->bh_first = NULL;
+	bhp->bh_lockp = BHVMAGIC;
+}
+
+
+/* ARGSUSED */
+void
+bhv_head_reinit(
+	bhv_head_t *bhp)
+{
+	ASSERT(bhp->bh_first == NULL);
+	ASSERT(bhp->bh_lockp == BHVMAGIC);
+}
+
+
+void
+bhv_insert_initial(
+	bhv_head_t *bhp,
+	bhv_desc_t *bdp)
+{
+	ASSERT(bhp->bh_first == NULL);
+	ASSERT(bhp->bh_lockp == BHVMAGIC);
+	(bhp)->bh_first = bdp;
+}
+
+void
+bhv_head_destroy(
+	bhv_head_t *bhp)
+{
+	ASSERT(bhp->bh_first == NULL);
+	ASSERT(bhp->bh_lockp == BHVMAGIC);
+	bhp->bh_lockp = NULL;
+}
+
diff --git a/fs/xfs/linux/xfs_behavior.h b/fs/xfs/linux/xfs_behavior.h
new file mode 100644
index 000000000000..ca369954fa77
--- /dev/null
+++ b/fs/xfs/linux/xfs_behavior.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BEHAVIOR_H__
+#define __XFS_BEHAVIOR_H__
+
+/*
+ * Header file used to associate behaviors with virtualized objects.
+ *
+ * A virtualized object is an internal, virtualized representation of
+ * OS entities such as persistent files, processes, or sockets.	 Examples
+ * of virtualized objects include vnodes, vprocs, and vsockets.	 Often
+ * a virtualized object is referred to simply as an "object."
+ *
+ * A behavior is essentially an implementation layer associated with
+ * an object.  Multiple behaviors for an object are chained together,
+ * the order of chaining determining the order of invocation.  Each
+ * behavior of a given object implements the same set of interfaces
+ * (e.g., the VOP interfaces).
+ *
+ * Behaviors may be dynamically inserted into an object's behavior chain,
+ * such that the addition is transparent to consumers that already have
+ * references to the object.  Typically, a given behavior will be inserted
+ * at a particular location in the behavior chain.  Insertion of new
+ * behaviors is synchronized with operations-in-progress (oip's) so that
+ * the oip's always see a consistent view of the chain.
+ *
+ * The term "interpostion" is used to refer to the act of inserting
+ * a behavior such that it interposes on (i.e., is inserted in front
+ * of) a particular other behavior.  A key example of this is when a
+ * system implementing distributed single system image wishes to
+ * interpose a distribution layer (providing distributed coherency)
+ * in front of an object that is otherwise only accessed locally.
+ *
+ * Note that the traditional vnode/inode combination is simply a virtualized
+ * object that has exactly one associated behavior.
+ *
+ * Behavior synchronization is logic which is necessary under certain
+ * circumstances that there is no conflict between ongoing operations
+ * traversing the behavior chain and those dunamically modifying the
+ * behavior chain.  Because behavior synchronization adds extra overhead
+ * to virtual operation invocation, we want to restrict, as much as
+ * we can, the requirement for this extra code, to those situations
+ * in which it is truly necessary.
+ *
+ * Behavior synchronization is needed whenever there's at least one class
+ * of object in the system for which:
+ * 1) multiple behaviors for a given object are supported,
+ * -- AND --
+ * 2a) insertion of a new behavior can happen dynamically at any time during
+ *     the life of an active object,
+ *	-- AND --
+ *	3a) insertion of a new behavior needs to synchronize with existing
+ *	    ops-in-progress.
+ *	-- OR --
+ *	3b) multiple different behaviors can be dynamically inserted at
+ *	    any time during the life of an active object
+ *	-- OR --
+ *	3c) removal of a behavior can occur at any time during the life of
+ *	    an active object.
+ * -- OR --
+ * 2b) removal of a behavior can occur at any time during the life of an
+ *     active object
+ *
+ */
+
+typedef void	bhv_head_lock_t;
+
+/*
+ * Behavior head.  Head of the chain of behaviors.
+ * Contained within each virtualized object data structure.
+ */
+typedef struct bhv_head {
+	struct bhv_desc *bh_first;	/* first behavior in chain */
+	bhv_head_lock_t *bh_lockp;	/* pointer to lock info struct */
+} bhv_head_t;
+
+/*
+ * Behavior descriptor.	 Descriptor associated with each behavior.
+ * Contained within the behavior's private data structure.
+ */
+typedef struct bhv_desc {
+	void		*bd_pdata;	/* private data for this behavior */
+	void		*bd_vobj;	/* virtual object associated with */
+	void		*bd_ops;	/* ops for this behavior */
+	struct bhv_desc *bd_next;	/* next behavior in chain */
+} bhv_desc_t;
+
+/*
+ * Behavior identity field.  A behavior's identity determines the position
+ * where it lives within a behavior chain, and it's always the first field
+ * of the behavior's ops vector. The optional id field further identifies the
+ * subsystem responsible for the behavior.
+ */
+typedef struct bhv_identity {
+	__u16	bi_id;		/* owning subsystem id */
+	__u16	bi_position;	/* position in chain */
+} bhv_identity_t;
+
+typedef bhv_identity_t bhv_position_t;
+
+#define BHV_IDENTITY_INIT(id,pos)	{id, pos}
+
+#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos)
+
+
+/*
+ * Define boundaries of position values.
+ */
+#define BHV_POSITION_INVALID	0	/* invalid position number */
+#define BHV_POSITION_BASE	1	/* base (last) implementation layer */
+#define BHV_POSITION_TOP	63	/* top (first) implementation layer */
+
+/*
+ * Plumbing macros.
+ */
+#define BHV_HEAD_FIRST(bhp)	(ASSERT((bhp)->bh_first), (bhp)->bh_first)
+#define BHV_NEXT(bdp)		(ASSERT((bdp)->bd_next), (bdp)->bd_next)
+#define BHV_NEXTNULL(bdp)	((bdp)->bd_next)
+#define BHV_VOBJ(bdp)		(ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj)
+#define BHV_VOBJNULL(bdp)	((bdp)->bd_vobj)
+#define BHV_PDATA(bdp)		(bdp)->bd_pdata
+#define BHV_OPS(bdp)		(bdp)->bd_ops
+#define BHV_IDENTITY(bdp)	((bhv_identity_t *)(bdp)->bd_ops)
+#define BHV_POSITION(bdp)	(BHV_IDENTITY(bdp)->bi_position)
+
+
+#define BHV_READ_LOCK(bhp)
+#define BHV_READ_UNLOCK(bhp)
+#define BHV_NOT_READ_LOCKED(bhp)	1
+#define BHV_IS_WRITE_LOCKED(bhp)	1
+#define BHV_NOT_WRITE_LOCKED(bhp)	1
+
+extern void bhv_head_init(bhv_head_t *, char *);
+extern void bhv_head_destroy(bhv_head_t *);
+extern void bhv_head_reinit(bhv_head_t *);
+extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
+
+/*
+ * Initialize a new behavior descriptor.
+ * Arguments:
+ *   bdp - pointer to behavior descriptor
+ *   pdata - pointer to behavior's private data
+ *   vobj - pointer to associated virtual object
+ *   ops - pointer to ops for this behavior
+ */
+#define bhv_desc_init(bdp, pdata, vobj, ops)		\
+ {							\
+	(bdp)->bd_pdata = pdata;			\
+	(bdp)->bd_vobj = vobj;				\
+	(bdp)->bd_ops = ops;				\
+	(bdp)->bd_next = NULL;				\
+ }
+
+#define BHV_DESC_INIT(so,A,B)	bhv_desc_init(&(so->so_bhv),so,A,B)
+
+/*
+ * Remove a behavior descriptor from a behavior chain.
+ */
+#define bhv_remove(bhp, bdp)				\
+ {							\
+	if ((bhp)->bh_first == (bdp)) {			\
+		/*					\
+		* Remove from front of chain.		\
+		 * Atomic wrt oip's.			\
+		*/					\
+	       (bhp)->bh_first = (bdp)->bd_next;	\
+	 } else {					\
+	       /* remove from non-front of chain */	\
+	       bhv_remove_not_first(bhp, bdp);		\
+	}						\
+	(bdp)->bd_vobj = NULL;				\
+ }
+
+/*
+ * Behavior module prototypes.
+ */
+extern void		bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
+extern bhv_desc_t *	bhv_lookup(bhv_head_t *bhp, void *ops);
+extern bhv_desc_t *	bhv_lookup_unlocked(bhv_head_t *bhp, void *ops);
+extern bhv_desc_t *	bhv_base_unlocked(bhv_head_t *bhp);
+
+#endif /* __XFS_BEHAVIOR_H__ */
diff --git a/fs/xfs/linux/xfs_cred.h b/fs/xfs/linux/xfs_cred.h
new file mode 100644
index 000000000000..1f20adc8fd21
--- /dev/null
+++ b/fs/xfs/linux/xfs_cred.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_CRED_H__
+#define __XFS_CRED_H__
+
+/*
+ * Credentials
+ */
+typedef struct cred {
+	/* EMPTY */
+} cred_t;
+
+extern struct cred *sys_cred;
+
+/* this is a hack.. (assums sys_cred is the only cred_t in the system) */
+static __inline int capable_cred(cred_t *cr, int cid)
+{
+	return (cr == sys_cred) ? 1 : capable(cid);
+}
+
+#endif	/* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux/xfs_file.c b/fs/xfs/linux/xfs_file.c
new file mode 100644
index 000000000000..8da94cb6741e
--- /dev/null
+++ b/fs/xfs/linux/xfs_file.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/dcache.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/mman.h> /* for PROT_WRITE */
+
+static struct vm_operations_struct linvfs_file_vm_ops;
+
+
+STATIC ssize_t
+linvfs_read(
+	struct file	*filp,
+	char		*buf,
+	size_t		size,
+	loff_t		*offset)
+{
+	vnode_t		*vp;
+	int		error;
+
+	vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
+	ASSERT(vp);
+
+	VOP_READ(vp, filp, buf, size, offset, NULL, error);
+
+	return(error);
+}
+
+
+STATIC ssize_t
+linvfs_write(
+	struct file	*file,
+	const char	*buf,
+	size_t		count,
+	loff_t		*ppos)
+{
+	struct inode	*inode = file->f_dentry->d_inode;
+	loff_t		pos;
+	vnode_t		*vp;
+	int		err;	/* Use negative errors in this f'n */
+
+	if ((ssize_t) count < 0)
+		return -EINVAL;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	pos = *ppos;
+	err = -EINVAL;
+	if (pos < 0)
+		goto out;
+
+	err = file->f_error;
+	if (err) {
+		file->f_error = 0;
+		goto out;
+	}
+
+	vp = LINVFS_GET_VP(inode);
+	ASSERT(vp);
+
+	/* We allow multiple direct writers in, there is no
+	 * potential call to vmtruncate in that path.
+	 */
+	if (!(file->f_flags & O_DIRECT))
+		down(&inode->i_sem);
+
+	VOP_WRITE(vp, file, buf, count, &pos, NULL, err);
+	*ppos = pos;
+
+	if (!(file->f_flags & O_DIRECT))
+		up(&inode->i_sem);
+out:
+
+	return(err);
+}
+
+
+STATIC int
+linvfs_open(
+	struct inode	*inode,
+	struct file	*filp)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error;
+
+	if (!(filp->f_flags & O_LARGEFILE) && inode->i_size > MAX_NON_LFS)
+		return -EFBIG;
+
+	ASSERT(vp);
+	VOP_OPEN(vp, NULL, error);
+	return -error;
+}
+
+
+STATIC int
+linvfs_release(
+	struct inode	*inode,
+	struct file	*filp)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error = 0;
+
+	if (vp)
+		VOP_RELEASE(vp, error);
+	return -error;
+}
+
+
+STATIC int
+linvfs_fsync(
+	struct file	*filp,
+	struct dentry	*dentry,
+	int		datasync)
+{
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error;
+	int		flags = FSYNC_WAIT;
+
+	if (datasync)
+		flags |= FSYNC_DATA;
+
+	ASSERT(vp);
+
+	VOP_FSYNC(vp, flags, NULL, (off_t)0, (off_t)-1, error);
+
+	return -error;
+}
+
+/*
+ * linvfs_readdir maps to VOP_READDIR().
+ * We need to build a uio, cred, ...
+ */
+
+#define nextdp(dp)	((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen))
+
+STATIC int
+linvfs_readdir(
+	struct file	*filp,
+	void		*dirent,
+	filldir_t	filldir)
+{
+	int		error = 0;
+	vnode_t		*vp;
+	uio_t		uio;
+	iovec_t		iov;
+	int		eof = 0;
+	caddr_t		read_buf;
+	int		namelen, size = 0;
+	size_t		rlen = PAGE_CACHE_SIZE << 2;
+	off_t		start_offset;
+	xfs_dirent_t	*dbp = NULL;
+
+	vp = LINVFS_GET_VP(filp->f_dentry->d_inode);
+	ASSERT(vp);
+
+	/* Try fairly hard to get memory */
+	do {
+		if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL)))
+			break;
+		rlen >>= 1;
+	} while (rlen >= 1024);
+
+	if (read_buf == NULL)
+		return -ENOMEM;
+
+	uio.uio_iov = &iov;
+	uio.uio_fmode = filp->f_mode;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_offset = filp->f_pos;
+
+	while (!eof) {
+		uio.uio_resid = iov.iov_len = rlen;
+		iov.iov_base = read_buf;
+		uio.uio_iovcnt = 1;
+
+		start_offset = uio.uio_offset;
+
+		VOP_READDIR(vp, &uio, NULL, &eof, error);
+		if ((uio.uio_offset == start_offset) || error) {
+			size = 0;
+			break;
+		}
+
+		size = rlen - uio.uio_resid;
+		dbp = (xfs_dirent_t *)read_buf;
+		while (size > 0) {
+			namelen = strlen(dbp->d_name);
+
+			if (filldir(dirent, dbp->d_name, namelen,
+					(loff_t) dbp->d_off,
+					(ino_t) dbp->d_ino,
+					DT_UNKNOWN)) {
+				goto done;
+			}
+			size -= dbp->d_reclen;
+			dbp = nextdp(dbp);
+		}
+	}
+done:
+	if (!error) {
+		if (size == 0)
+			filp->f_pos = uio.uio_offset;
+		else if (dbp)
+			filp->f_pos = dbp->d_off;
+	}
+
+	kfree(read_buf);
+	return -error;
+}
+
+STATIC int
+linvfs_file_mmap(
+	struct file	*filp,
+	struct vm_area_struct *vma)
+{
+	struct inode	*ip = filp->f_dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(ip);
+	vattr_t		va = { .va_mask = AT_UPDATIME };
+	int		error;
+
+	if ((vp->v_type == VREG) && (vp->v_vfsp->vfs_flag & VFS_DMI)) {
+		error = -xfs_dm_send_mmap_event(vma, 0);
+		if (error)
+			return error;
+	}
+
+	vma->vm_ops = &linvfs_file_vm_ops;
+
+	VOP_SETATTR(vp, &va, AT_UPDATIME, NULL, error);
+	UPDATE_ATIME(ip);
+	return 0;
+}
+
+
+STATIC int
+linvfs_ioctl(
+	struct inode	*inode,
+	struct file	*filp,
+	unsigned int	cmd,
+	unsigned long	arg)
+{
+	int		error;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	ASSERT(vp);
+	VOP_IOCTL(vp, inode, filp, cmd, arg, error);
+	VMODIFY(vp);
+
+	/* NOTE:  some of the ioctl's return positive #'s as a
+	 *	  byte count indicating success, such as
+	 *	  readlink_by_handle.  So we don't "sign flip"
+	 *	  like most other routines.  This means true
+	 *	  errors need to be returned as a negative value.
+	 */
+	return error;
+}
+
+#ifdef HAVE_VMOP_MPROTECT
+STATIC int
+linvfs_mprotect(
+	struct vm_area_struct *vma,
+	unsigned int	newflags)
+{
+	vnode_t		*vp = LINVFS_GET_VP(vma->vm_file->f_dentry->d_inode);
+	int		error = 0;
+
+	if ((vp->v_type == VREG) && (vp->v_vfsp->vfs_flag & VFS_DMI)) {
+		if ((vma->vm_flags & VM_MAYSHARE) &&
+		    (newflags & PROT_WRITE) && !(vma->vm_flags & PROT_WRITE)){
+			error = xfs_dm_send_mmap_event(vma, VM_WRITE);
+		    }
+	}
+	return error;
+}
+#endif /* HAVE_VMOP_MPROTECT */
+
+
+struct file_operations linvfs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= linvfs_read,
+	.write		= linvfs_write,
+	.ioctl		= linvfs_ioctl,
+	.mmap		= linvfs_file_mmap,
+	.open		= linvfs_open,
+	.release	= linvfs_release,
+	.fsync		= linvfs_fsync,
+};
+
+struct file_operations linvfs_dir_operations = {
+	.read		= generic_read_dir,
+	.readdir	= linvfs_readdir,
+	.ioctl		= linvfs_ioctl,
+	.fsync		= linvfs_fsync,
+};
+
+static struct vm_operations_struct linvfs_file_vm_ops = {
+	.nopage		= filemap_nopage,
+#ifdef HAVE_VMOP_MPROTECT
+	.mprotect	= linvfs_mprotect,
+#endif
+};
diff --git a/fs/xfs/linux/xfs_fs_subr.c b/fs/xfs/linux/xfs_fs_subr.c
new file mode 100644
index 000000000000..8d50bd04d718
--- /dev/null
+++ b/fs/xfs/linux/xfs_fs_subr.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * Implementation for VFS_DOUNMOUNT.
+ */
+int
+fs_dounmount(
+	bhv_desc_t	*bdp,
+	int		flags,
+	vnode_t		*rootvp,
+	cred_t		*cr)
+{
+	struct vfs	*vfsp = bhvtovfs(bdp);
+	bhv_desc_t	*fbdp = vfsp->vfs_fbhv;
+	int		error;
+
+	/*
+	 * Wait for sync to finish and lock vfsp.  This also sets the
+	 * VFS_OFFLINE flag.  Once we do this we can give up reference
+	 * the root vnode which we hold to avoid the another unmount
+	 * ripping the vfs out from under us before we get to lock it.
+	 * The VFS_DOUNMOUNT calling convention is that the reference
+	 * on the rot vnode is released whether the call succeeds or
+	 * fails.
+	 */
+	if (rootvp)
+		VN_RELE(rootvp);
+
+	/*
+	 * Now invoke SYNC and UNMOUNT ops, using the PVFS versions is
+	 * OK since we already have a behavior lock as a result of
+	 * being in VFS_DOUNMOUNT.  It is necessary to do things this
+	 * way since using the VFS versions would cause us to get the
+	 * behavior lock twice which can cause deadlock as well as
+	 * making the coding of vfs relocation unnecessarilty difficult
+	 * by making relocations invoked by unmount occur in a different
+	 * environment than those invoked by mount-update.
+	 */
+	PVFS_SYNC(fbdp, SYNC_ATTR|SYNC_DELWRI, cr, error);
+	if (error == 0)
+		PVFS_UNMOUNT(fbdp, flags, cr, error);
+	return error;
+}
+
+/*
+ * Stub for no-op vnode operations that return error status.
+ */
+int
+fs_noerr()
+{
+	return 0;
+}
+
+/*
+ * Operation unsupported under this file system.
+ */
+int
+fs_nosys()
+{
+	return ENOSYS;
+}
+
+/*
+ * Stub for inactive, strategy, and read/write lock/unlock.  Does nothing.
+ */
+/* ARGSUSED */
+void
+fs_noval()
+{
+}
+
+/*
+ * vnode pcache layer for vnode_tosspages.
+ * 'last' parameter unused but left in for IRIX compatibility
+ */
+void
+fs_tosspages(
+	bhv_desc_t	*bdp,
+	xfs_off_t	first,
+	xfs_off_t	last,
+	int		fiopt)
+{
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	struct inode	*ip = LINVFS_GET_IP(vp);
+
+	if (VN_CACHED(vp))
+		truncate_inode_pages(ip->i_mapping, first);
+}
+
+
+/*
+ * vnode pcache layer for vnode_flushinval_pages.
+ * 'last' parameter unused but left in for IRIX compatibility
+ */
+void
+fs_flushinval_pages(
+	bhv_desc_t	*bdp,
+	xfs_off_t	first,
+	xfs_off_t	last,
+	int		fiopt)
+{
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	struct inode	*ip = LINVFS_GET_IP(vp);
+
+	if (VN_CACHED(vp)) {
+		filemap_fdatawait(ip->i_mapping);
+		filemap_fdatawrite(ip->i_mapping);
+		filemap_fdatawait(ip->i_mapping);
+
+		truncate_inode_pages(ip->i_mapping, first);
+	}
+}
+
+/*
+ * vnode pcache layer for vnode_flush_pages.
+ * 'last' parameter unused but left in for IRIX compatibility
+ */
+int
+fs_flush_pages(
+	bhv_desc_t	*bdp,
+	xfs_off_t	first,
+	xfs_off_t	last,
+	uint64_t	flags,
+	int		fiopt)
+{
+	vnode_t		*vp = BHV_TO_VNODE(bdp);
+	struct inode	*ip = LINVFS_GET_IP(vp);
+
+	if (VN_CACHED(vp)) {
+		filemap_fdatawait(ip->i_mapping);
+		filemap_fdatawrite(ip->i_mapping);
+		filemap_fdatawait(ip->i_mapping);
+	}
+
+	return 0;
+}
+
diff --git a/fs/xfs/linux/xfs_fs_subr.h b/fs/xfs/linux/xfs_fs_subr.h
new file mode 100644
index 000000000000..b6d780b68677
--- /dev/null
+++ b/fs/xfs/linux/xfs_fs_subr.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUBR_H__
+#define __XFS_SUBR_H__
+
+/*
+ * Utilities shared among file system implementations.
+ */
+
+struct cred;
+
+extern int	fs_noerr(void);
+extern int	fs_nosys(void);
+extern int	fs_nodev(void);
+extern void	fs_noval(void);
+extern int	fs_dounmount(bhv_desc_t *, int, vnode_t *, struct cred *);
+extern void	fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+extern void	fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+extern int	fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
+
+#endif	/* __XFS_FS_SUBR_H__ */
diff --git a/fs/xfs/linux/xfs_globals.c b/fs/xfs/linux/xfs_globals.c
new file mode 100644
index 000000000000..54a4343289f1
--- /dev/null
+++ b/fs/xfs/linux/xfs_globals.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains globals needed by XFS that were normally defined
+ * somewhere else in IRIX.
+ */
+
+#include <xfs.h>
+
+uint64_t	xfs_panic_mask;		/* set to cause more panics */
+unsigned long	xfs_physmem;
+
+/*
+ * restricted_chown = 1	 bsd style chown(2), only super-user can give away files
+ * restricted_chown = 0	 sysV style chown(2), non super-user can give away files
+ */
+int		restricted_chown = 1;
+
+/*
+ * Used to serialize atomicIncWithWrap.
+ */
+spinlock_t Atomic_spin = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Global system credential structure.
+ */
+cred_t sys_cred_val, *sys_cred = &sys_cred_val;
+
+/*
+ * The global quota manager. There is only one of these for the entire
+ * system, _not_ one per file system. XQM keeps track of the overall
+ * quota functionality, including maintaining the freelist and hash
+ * tables of dquots.
+ */
+struct xfs_qm	*xfs_Gqm;
+mutex_t		xfs_Gqm_lock;
+
+/* Export XFS symbols used by xfsidbg */
+EXPORT_SYMBOL(xfs_Gqm);
+EXPORT_SYMBOL(xfs_next_bit);
+EXPORT_SYMBOL(xfs_contig_bits);
diff --git a/fs/xfs/linux/xfs_globals.h b/fs/xfs/linux/xfs_globals.h
new file mode 100644
index 000000000000..943e029f1d42
--- /dev/null
+++ b/fs/xfs/linux/xfs_globals.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_GLOBALS_H__
+#define __XFS_GLOBALS_H__
+
+/*
+ * This file declares globals needed by XFS that were normally defined
+ * somewhere else in IRIX.
+ */
+
+extern uint64_t xfs_panic_mask;		/* set to cause more panics */
+
+extern int	restricted_chown;
+extern unsigned long	xfs_physmem;
+
+extern struct cred *sys_cred;
+
+extern struct xfs_qm	*xfs_Gqm;
+extern mutex_t		xfs_Gqm_lock;
+
+#endif	/* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux/xfs_ioctl.c b/fs/xfs/linux/xfs_ioctl.c
new file mode 100644
index 000000000000..a2b5f0162ccd
--- /dev/null
+++ b/fs/xfs/linux/xfs_ioctl.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_fsops.h>
+#include <xfs_dfrag.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/iobuf.h>
+
+
+extern int xfs_change_file_space(bhv_desc_t *, int,
+			xfs_flock64_t *, xfs_off_t, cred_t *, int);
+extern int xfs_set_dmattrs(bhv_desc_t *, u_int, u_int16_t, cred_t *);
+
+
+/*
+ * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
+ * a file or fs handle.
+ *
+ * XFS_IOC_PATH_TO_FSHANDLE
+ *    returns fs handle for a mount point or path within that mount point
+ * XFS_IOC_FD_TO_HANDLE
+ *    returns full handle for a FD opened in user space
+ * XFS_IOC_PATH_TO_HANDLE
+ *    returns full handle for a path
+ */
+STATIC int
+xfs_find_handle(
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	int			hsize;
+	xfs_handle_t		handle;
+	xfs_fsop_handlereq_t	hreq;
+	struct inode		*inode;
+	struct vnode		*vp;
+
+	if (copy_from_user(&hreq, (xfs_fsop_handlereq_t *)arg, sizeof(hreq)))
+		return -XFS_ERROR(EFAULT);
+
+	bzero((char *)&handle, sizeof(handle));
+
+	switch (cmd) {
+	case XFS_IOC_PATH_TO_FSHANDLE:
+	case XFS_IOC_PATH_TO_HANDLE: {
+		struct nameidata	nd;
+		char			*path;
+		int			error;
+
+		/* we need the path */
+		path = getname(hreq.path);
+		if (IS_ERR(path))
+			return PTR_ERR(path);
+
+		/* traverse the path */
+		error = path_lookup(path, 0, &nd);
+		putname(path);
+		if (error)
+			return error;
+
+		ASSERT(nd.dentry);
+		ASSERT(nd.dentry->d_inode);
+		inode = igrab(nd.dentry->d_inode);
+		path_release(&nd);
+		break;
+	}
+
+	case XFS_IOC_FD_TO_HANDLE: {
+		struct file	*file;
+
+		file = fget(hreq.fd);
+		if (!file)
+		    return -EBADF;
+
+		ASSERT(file->f_dentry);
+		ASSERT(file->f_dentry->d_inode);
+		inode = igrab(file->f_dentry->d_inode);
+		fput(file);
+
+		break;
+	}
+
+	default:
+		ASSERT(0);
+		return -XFS_ERROR(EINVAL);
+	}
+
+	if (inode->i_sb->s_magic != XFS_SB_MAGIC) {
+		/* we're not in XFS anymore, Toto */
+		iput(inode);
+		return -XFS_ERROR(EINVAL);
+	}
+
+	/* we need the vnode */
+	vp = LINVFS_GET_VP(inode);
+	if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
+		iput(inode);
+		return -XFS_ERROR(EBADF);
+	}
+
+	/* now we can grab the fsid */
+	memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t));
+	hsize = sizeof(xfs_fsid_t);
+
+	if (cmd != XFS_IOC_PATH_TO_FSHANDLE) {
+		xfs_inode_t	*ip;
+		bhv_desc_t	*bhv;
+		int		lock_mode;
+
+		/* need to get access to the xfs_inode to read the generation */
+		VN_BHV_READ_LOCK(&(vp)->v_bh);
+		bhv = VNODE_TO_FIRST_BHV(vp);
+		ASSERT(bhv);
+		ip = XFS_BHVTOI(bhv);
+		ASSERT(ip);
+		lock_mode = xfs_ilock_map_shared(ip);
+
+		/* fill in fid section of handle from inode */
+		handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) -
+					    sizeof(handle.ha_fid.xfs_fid_len);
+		handle.ha_fid.xfs_fid_pad = 0;
+		handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen;
+		handle.ha_fid.xfs_fid_ino = ip->i_ino;
+
+		xfs_iunlock_map_shared(ip, lock_mode);
+		VN_BHV_READ_UNLOCK(&(vp)->v_bh);
+
+		hsize = XFS_HSIZE(handle);
+	}
+
+	/* now copy our handle into the user buffer & write out the size */
+	if (copy_to_user((xfs_handle_t *)hreq.ohandle, &handle, hsize) ||
+	    copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) {
+		iput(inode);
+		return -XFS_ERROR(EFAULT);
+	}
+
+	iput(inode);
+	return 0;
+}
+
+
+/*
+ * Convert userspace handle data into vnode (and inode).
+ * We [ab]use the fact that all the fsop_handlereq ioctl calls
+ * have a data structure argument whose first component is always
+ * a xfs_fsop_handlereq_t, so we can cast to and from this type.
+ * This allows us to optimise the copy_from_user calls and gives
+ * a handy, shared routine.
+ *
+ * If no error, caller must always VN_RELE the returned vp.
+ */
+STATIC int
+xfs_vget_fsop_handlereq(
+	xfs_mount_t		*mp,
+	struct inode		*parinode,	/* parent inode pointer	   */
+	int			cap,		/* capability level for op */
+	unsigned long		arg,		/* userspace data pointer  */
+	unsigned long		size,		/* size of expected struct */
+	/* output arguments */
+	xfs_fsop_handlereq_t	*hreq,
+	vnode_t			**vp,
+	struct inode		**inode)
+{
+	void			*hanp;
+	size_t			hlen;
+	xfs_fid_t		*xfid;
+	xfs_handle_t		*handlep;
+	xfs_handle_t		handle;
+	xfs_inode_t		*ip;
+	struct inode		*inodep;
+	vnode_t			*vpp;
+	__u32			igen;
+	ino_t			ino;
+	int			error;
+
+	if (!capable(cap))
+		return XFS_ERROR(EPERM);
+
+	/*
+	 * Only allow handle opens under a directory.
+	 */
+	if (!S_ISDIR(parinode->i_mode))
+		return XFS_ERROR(ENOTDIR);
+
+	/*
+	 * Copy the handle down from the user and validate
+	 * that it looks to be in the correct format.
+	 */
+	if (copy_from_user(hreq, (struct xfs_fsop_handlereq *)arg, size))
+		return XFS_ERROR(EFAULT);
+
+	hanp = hreq->ihandle;
+	hlen = hreq->ihandlen;
+	handlep = &handle;
+
+	if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
+		return XFS_ERROR(EINVAL);
+	if (copy_from_user(handlep, hanp, hlen))
+		return XFS_ERROR(EFAULT);
+	if (hlen < sizeof(*handlep))
+		bzero(((char *)handlep) + hlen, sizeof(*handlep) - hlen);
+	if (hlen > sizeof(handlep->ha_fsid)) {
+		if (handlep->ha_fid.xfs_fid_len !=
+				(hlen - sizeof(handlep->ha_fsid)
+					- sizeof(handlep->ha_fid.xfs_fid_len))
+		    || handlep->ha_fid.xfs_fid_pad)
+			return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * Crack the handle, obtain the inode # & generation #
+	 */
+	xfid = (struct xfs_fid *)&handlep->ha_fid;
+	if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) {
+		ino  = xfid->xfs_fid_ino;
+		igen = xfid->xfs_fid_gen;
+	} else {
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * Get the XFS inode, building a vnode to go with it.
+	 */
+	error = xfs_iget(mp, NULL, ino, XFS_ILOCK_SHARED, &ip, 0);
+	if (error)
+		return error;
+	if (ip == NULL)
+		return XFS_ERROR(EIO);
+	if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) {
+		xfs_iput_new(ip, XFS_ILOCK_SHARED);
+		return XFS_ERROR(ENOENT);
+	}
+
+	vpp = XFS_ITOV(ip);
+	inodep = LINVFS_GET_IP(vpp);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	error = linvfs_revalidate_core(inodep, ATTR_COMM);
+	if (error) {
+		iput(inodep);
+		/* this error is (-) but our callers expect + */
+		return XFS_ERROR(-error);
+	}
+
+	*vp = vpp;
+	*inode = inodep;
+	return 0;
+}
+
+STATIC int
+xfs_open_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	int			new_fd;
+	int			permflag;
+	struct file		*filp;
+	struct inode		*inode;
+	struct dentry		*dentry;
+	vnode_t			*vp;
+	xfs_fsop_handlereq_t	hreq;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg,
+					sizeof(xfs_fsop_handlereq_t),
+					&hreq, &vp, &inode);
+	if (error)
+		return -error;
+
+	/* Restrict xfs_open_by_handle to directories & regular files. */
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+		iput(inode);
+		return -XFS_ERROR(EINVAL);
+	}
+
+#if BITS_PER_LONG != 32
+	hreq.oflags |= O_LARGEFILE;
+#endif
+	/* Put open permission in namei format. */
+	permflag = hreq.oflags;
+	if ((permflag+1) & O_ACCMODE)
+		permflag++;
+	if (permflag & O_TRUNC)
+		permflag |= 2;
+
+	/* Can't write directories. */
+	if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
+		iput(inode);
+		return -XFS_ERROR(EISDIR);
+	}
+
+	if ((new_fd = get_unused_fd()) < 0) {
+		iput(inode);
+		return new_fd;
+	}
+
+	dentry = d_alloc_anon(inode);
+	if (dentry == NULL) {
+		iput(inode);
+		put_unused_fd(new_fd);
+		return -XFS_ERROR(ENOMEM);
+	}
+
+	/* Ensure umount returns EBUSY on umounts while this file is open. */
+	mntget(parfilp->f_vfsmnt);
+
+	/* Create file pointer. */
+	filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags);
+	if (IS_ERR(filp)) {
+		put_unused_fd(new_fd);
+		return -XFS_ERROR(-PTR_ERR(filp));
+	}
+	filp->f_mode |= FINVIS;
+
+	fd_install(new_fd, filp);
+	return new_fd;
+}
+
+STATIC int
+xfs_readlink_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	struct iovec		aiov;
+	struct uio		auio;
+	struct inode		*inode;
+	xfs_fsop_handlereq_t	hreq;
+	vnode_t			*vp;
+	__u32			olen;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg,
+					sizeof(xfs_fsop_handlereq_t),
+					&hreq, &vp, &inode);
+	if (error)
+		return -error;
+
+	/* Restrict this handle operation to symlinks only. */
+	if (vp->v_type != VLNK) {
+		VN_RELE(vp);
+		return -XFS_ERROR(EINVAL);
+	}
+
+	if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) {
+		VN_RELE(vp);
+		return -XFS_ERROR(EFAULT);
+	}
+	aiov.iov_len	= olen;
+	aiov.iov_base	= hreq.ohandle;
+
+	auio.uio_iov	= &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_fmode	= FINVIS;
+	auio.uio_offset = 0;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_resid	= olen;
+
+	VOP_READLINK(vp, &auio, NULL, error);
+
+	VN_RELE(vp);
+	return (olen - auio.uio_resid);
+}
+
+STATIC int
+xfs_fssetdm_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	struct fsdmidata	fsd;
+	xfs_fsop_setdm_handlereq_t dmhreq;
+	struct inode		*inode;
+	bhv_desc_t		*bdp;
+	vnode_t			*vp;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_MKNOD, arg,
+					sizeof(xfs_fsop_setdm_handlereq_t),
+					(xfs_fsop_handlereq_t *)&dmhreq,
+					&vp, &inode);
+	if (error)
+		return -error;
+
+	if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
+		VN_RELE(vp);
+		return -XFS_ERROR(EFAULT);
+	}
+
+	bdp = bhv_base_unlocked(VN_BHV_HEAD(vp));
+	error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL);
+
+	VN_RELE(vp);
+	if (error)
+		return -error;
+	return 0;
+}
+
+STATIC int
+xfs_attrlist_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	attrlist_cursor_kern_t	*cursor;
+	xfs_fsop_attrlist_handlereq_t al_hreq;
+	struct inode		*inode;
+	vnode_t			*vp;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg,
+					sizeof(xfs_fsop_attrlist_handlereq_t),
+					(xfs_fsop_handlereq_t *)&al_hreq,
+					&vp, &inode);
+	if (error)
+		return -error;
+
+	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+	VOP_ATTR_LIST(vp, al_hreq.buffer, al_hreq.buflen, al_hreq.flags,
+			cursor, NULL, error);
+	VN_RELE(vp);
+	if (error)
+		return -error;
+	return 0;
+}
+
+STATIC int
+xfs_attrmulti_by_handle(
+	xfs_mount_t		*mp,
+	unsigned long		arg,
+	struct file		*parfilp,
+	struct inode		*parinode)
+{
+	int			error;
+	xfs_attr_multiop_t	*ops;
+	xfs_fsop_attrmulti_handlereq_t am_hreq;
+	struct inode		*inode;
+	vnode_t			*vp;
+	int			i, size;
+
+	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg,
+					sizeof(xfs_fsop_attrmulti_handlereq_t),
+					(xfs_fsop_handlereq_t *)&am_hreq,
+					&vp, &inode);
+	if (error)
+		return -error;
+
+	size = am_hreq.opcount * sizeof(attr_multiop_t);
+	ops = (xfs_attr_multiop_t *)kmalloc(size, GFP_KERNEL);
+	if (!ops) {
+		VN_RELE(vp);
+		return -XFS_ERROR(ENOMEM);
+	}
+
+	if (copy_from_user(ops, am_hreq.ops, size)) {
+		kfree(ops);
+		VN_RELE(vp);
+		return -XFS_ERROR(EFAULT);
+	}
+
+	for (i = 0; i < am_hreq.opcount; i++) {
+		switch(ops[i].am_opcode) {
+		case ATTR_OP_GET:
+			VOP_ATTR_GET(vp,ops[i].am_attrname, ops[i].am_attrvalue,
+					&ops[i].am_length, ops[i].am_flags,
+					NULL, ops[i].am_error);
+			break;
+		case ATTR_OP_SET:
+			VOP_ATTR_SET(vp,ops[i].am_attrname, ops[i].am_attrvalue,
+					ops[i].am_length, ops[i].am_flags,
+					NULL, ops[i].am_error);
+			break;
+		case ATTR_OP_REMOVE:
+			VOP_ATTR_REMOVE(vp, ops[i].am_attrname, ops[i].am_flags,
+					NULL, ops[i].am_error);
+			break;
+		default:
+			ops[i].am_error = EINVAL;
+		}
+	}
+
+	if (copy_to_user(am_hreq.ops, ops, size))
+		error = -XFS_ERROR(EFAULT);
+
+	kfree(ops);
+	VN_RELE(vp);
+	return error;
+}
+
+/* prototypes for a few of the stack-hungry cases that have
+ * their own functions.  Functions are defined after their use
+ * so gcc doesn't get fancy and inline them with -03 */
+
+int xfs_ioc_space(
+	bhv_desc_t		*bdp,
+	vnode_t			*vp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+int xfs_ioc_bulkstat(
+	xfs_mount_t		*mp,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+int xfs_ioc_fsgeometry_v1(
+	xfs_mount_t		*mp,
+	unsigned long		arg);
+
+int xfs_ioc_fsgeometry(
+	xfs_mount_t		*mp,
+	unsigned long		arg);
+
+int xfs_ioc_xattr(
+	vnode_t			*vp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+int xfs_ioc_getbmap(
+	bhv_desc_t		*bdp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg);
+
+int xfs_ioc_getbmapx(
+	bhv_desc_t		*bdp,
+	unsigned long		arg);
+
+int
+xfs_ioctl(
+	bhv_desc_t		*bdp,
+	struct inode		*inode,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	int			error;
+	vnode_t			*vp;
+	xfs_inode_t		*ip;
+	xfs_mount_t		*mp;
+
+	vp = LINVFS_GET_VP(inode);
+
+	vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	switch (cmd) {
+
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP64:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP64:
+		return xfs_ioc_space(bdp, vp, filp, cmd, arg);
+
+	case XFS_IOC_DIOINFO: {
+		struct dioattr	da;
+
+		da.d_miniosz = mp->m_sb.sb_blocksize;
+		da.d_mem = mp->m_sb.sb_blocksize;
+
+		/*
+		 * this only really needs to be BBSIZE.
+		 * it is set to the file system block size to
+		 * avoid having to do block zeroing on short writes.
+		 */
+		da.d_maxiosz = XFS_FSB_TO_B(mp,
+				XFS_B_TO_FSBT(mp, KIO_MAX_ATOMIC_IO << 10));
+
+		if (copy_to_user((struct dioattr *)arg, &da, sizeof(da)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_FSBULKSTAT_SINGLE:
+	case XFS_IOC_FSBULKSTAT:
+	case XFS_IOC_FSINUMBERS:
+		return xfs_ioc_bulkstat(mp, cmd, arg);
+
+	case XFS_IOC_FSGEOMETRY_V1:
+		return xfs_ioc_fsgeometry_v1(mp, arg);
+
+	case XFS_IOC_FSGEOMETRY:
+		return xfs_ioc_fsgeometry(mp, arg);
+
+	case XFS_IOC_FSGETXATTR:
+	case XFS_IOC_FSSETXATTR:
+	case XFS_IOC_FSGETXATTRA:
+		return xfs_ioc_xattr(vp, filp, cmd, arg);
+
+	case XFS_IOC_FSSETDM: {
+		struct fsdmidata	dmi;
+
+		if (copy_from_user(&dmi, (struct fsdmidata *)arg, sizeof(dmi)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate,
+							NULL);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_GETBMAP:
+	case XFS_IOC_GETBMAPA:
+		return xfs_ioc_getbmap(bdp, filp, cmd, arg);
+
+	case XFS_IOC_GETBMAPX:
+		return xfs_ioc_getbmapx(bdp, arg);
+
+	case XFS_IOC_FD_TO_HANDLE:
+	case XFS_IOC_PATH_TO_HANDLE:
+	case XFS_IOC_PATH_TO_FSHANDLE:
+		return xfs_find_handle(cmd, arg);
+
+	case XFS_IOC_OPEN_BY_HANDLE:
+		return xfs_open_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_FSSETDM_BY_HANDLE:
+		return xfs_fssetdm_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_READLINK_BY_HANDLE:
+		return xfs_readlink_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_ATTRLIST_BY_HANDLE:
+		return xfs_attrlist_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_ATTRMULTI_BY_HANDLE:
+		return xfs_attrmulti_by_handle(mp, arg, filp, inode);
+
+	case XFS_IOC_SWAPEXT: {
+		error = xfs_swapext((struct xfs_swapext *)arg);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_FSCOUNTS: {
+		xfs_fsop_counts_t out;
+
+		error = xfs_fs_counts(mp, &out);
+		if (error)
+			return -error;
+
+		if (copy_to_user((char *)arg, &out, sizeof(out)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_SET_RESBLKS: {
+		xfs_fsop_resblks_t inout;
+		__uint64_t	   in;
+
+		/* Only allow the sys admin to reserve space unless
+		 * unwritten extents are enabled.
+		 */
+		if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) &&
+		    !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&inout, (char *)arg, sizeof(inout)))
+			return -XFS_ERROR(EFAULT);
+
+		/* input parameter is passed in resblks field of structure */
+		in = inout.resblks;
+		error = xfs_reserve_blocks(mp, &in, &inout);
+
+		if (copy_to_user((char *)arg, &inout, sizeof(inout)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_GET_RESBLKS: {
+		xfs_fsop_resblks_t out;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		error = xfs_reserve_blocks(mp, NULL, &out);
+		if (error)
+			return -error;
+
+		if (copy_to_user((char *)arg, &out, sizeof(out)))
+			return -XFS_ERROR(EFAULT);
+
+		return 0;
+	}
+
+	case XFS_IOC_FSGROWFSDATA: {
+		xfs_growfs_data_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, (char *)arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_data(mp, &in);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_FSGROWFSLOG: {
+		xfs_growfs_log_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, (char *)arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_log(mp, &in);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_FSGROWFSRT: {
+		xfs_growfs_rt_t in;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&in, (char *)arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_growfs_rt(mp, &in);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_FREEZE:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xfs_fs_freeze(mp);
+		return 0;
+
+	case XFS_IOC_THAW:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xfs_fs_thaw(mp);
+		return 0;
+
+	case XFS_IOC_ERROR_INJECTION: {
+		xfs_error_injection_t in;
+
+		if (copy_from_user(&in, (char *)arg, sizeof(in)))
+			return -XFS_ERROR(EFAULT);
+
+		error = xfs_errortag_add(in.errtag, mp);
+		if (error)
+			return -error;
+		return 0;
+	}
+
+	case XFS_IOC_ERROR_CLEARALL:
+		error = xfs_errortag_clearall(mp);
+		return -error;
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+int xfs_ioc_space(
+	bhv_desc_t		*bdp,
+	vnode_t			*vp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	xfs_flock64_t	bf;
+	int		attr_flags = 0;
+	int		error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -XFS_ERROR(EPERM);
+
+	if (filp->f_flags & O_RDONLY)
+		return -XFS_ERROR(EBADF);
+
+	if (vp->v_type != VREG)
+		return -XFS_ERROR(EINVAL);
+
+	if (copy_from_user(&bf, (xfs_flock64_t *)arg, sizeof(bf)))
+		return -XFS_ERROR(EFAULT);
+
+	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+		attr_flags |= ATTR_NONBLOCK;
+	if (filp->f_mode & FINVIS)
+		attr_flags |= ATTR_DMI;
+
+	error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos,
+					      NULL, attr_flags);
+	return -error;
+}
+
+int xfs_ioc_bulkstat(
+	xfs_mount_t		*mp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	xfs_fsop_bulkreq_t bulkreq;
+	int		count;		/* # of records returned */
+	xfs_ino_t	inlast;		/* last inode number */
+	int		done;
+	int		error;
+	/* done = 1 if there are more stats to get and if bulkstat */
+	/* should be called again (unused here, but used in dmapi) */
+
+	/* Do not allow space reservation if this is not the admin and
+	 * unwritten extents are turned off.
+	 */
+	if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -XFS_ERROR(EIO);
+
+	if (copy_from_user(&bulkreq, (xfs_fsop_bulkreq_t *)arg,
+					sizeof(xfs_fsop_bulkreq_t)))
+		return -XFS_ERROR(EFAULT);
+
+	if (copy_from_user(&inlast, (__s64 *)bulkreq.lastip,
+						sizeof(__s64)))
+		return -XFS_ERROR(EFAULT);
+
+	if ((count = bulkreq.icount) <= 0)
+		return -XFS_ERROR(EINVAL);
+
+	if (cmd == XFS_IOC_FSINUMBERS)
+		error = xfs_inumbers(mp, NULL, &inlast, &count,
+						bulkreq.ubuffer);
+	else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
+		error = xfs_bulkstat_single(mp, &inlast,
+						bulkreq.ubuffer, &done);
+	else {	/* XFS_IOC_FSBULKSTAT */
+		if (count == 1 && inlast != 0) {
+			inlast++;
+			error = xfs_bulkstat_single(mp, &inlast,
+					bulkreq.ubuffer, &done);
+		} else {
+			error = xfs_bulkstat(mp, NULL, &inlast, &count,
+				(bulkstat_one_pf)xfs_bulkstat_one,
+				sizeof(xfs_bstat_t), bulkreq.ubuffer,
+				BULKSTAT_FG_QUICK, &done);
+		}
+	}
+
+	if (error)
+		return -error;
+
+	if (bulkreq.ocount != NULL) {
+		if (copy_to_user((xfs_ino_t *)bulkreq.lastip, &inlast,
+						sizeof(xfs_ino_t)))
+			return -XFS_ERROR(EFAULT);
+
+		if (copy_to_user((__s32 *)bulkreq.ocount, &count,
+						sizeof(count)))
+			return -XFS_ERROR(EFAULT);
+	}
+
+	return 0;
+}
+
+int xfs_ioc_fsgeometry_v1(
+	xfs_mount_t		*mp,
+	unsigned long		arg)
+{
+	xfs_fsop_geom_v1_t	fsgeo;
+	int			error;
+
+	error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
+	if (error)
+		return -error;
+
+	if (copy_to_user((xfs_fsop_geom_t *)arg, &fsgeo, sizeof(fsgeo)))
+		return -XFS_ERROR(EFAULT);
+	return 0;
+}
+
+int xfs_ioc_fsgeometry(
+	xfs_mount_t		*mp,
+	unsigned long		arg)
+{
+	xfs_fsop_geom_t fsgeo;
+	int		error;
+
+	error = xfs_fs_geometry(mp, &fsgeo, 4);
+	if (error)
+		return -error;
+
+	if (copy_to_user((xfs_fsop_geom_t *)arg, &fsgeo, sizeof(fsgeo)))
+		return -XFS_ERROR(EFAULT);
+	return 0;
+}
+
+int xfs_ioc_xattr(
+	vnode_t			*vp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	struct fsxattr	fa;
+	vattr_t		va;
+	int		error;
+
+	switch (cmd) {
+	case XFS_IOC_FSGETXATTR: {
+		va.va_mask = AT_XFLAGS|AT_EXTSIZE|AT_NEXTENTS;
+		VOP_GETATTR(vp, &va, 0, NULL, error);
+		if (error)
+			return -error;
+
+		fa.fsx_xflags	= va.va_xflags;
+		fa.fsx_extsize	= va.va_extsize;
+		fa.fsx_nextents = va.va_nextents;
+
+		if (copy_to_user((struct fsxattr *)arg, &fa, sizeof(fa)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+
+	case XFS_IOC_FSSETXATTR: {
+		int		attr_flags = 0;
+
+		if (copy_from_user(&fa, (struct fsxattr *)arg, sizeof(fa)))
+			return -XFS_ERROR(EFAULT);
+
+		va.va_mask = AT_XFLAGS | AT_EXTSIZE;
+		va.va_xflags  = fa.fsx_xflags;
+		va.va_extsize = fa.fsx_extsize;
+
+		if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+			attr_flags |= ATTR_NONBLOCK;
+
+		VOP_SETATTR(vp, &va, attr_flags, NULL, error);
+		return -error;
+	}
+
+	case XFS_IOC_FSGETXATTRA: {
+
+		va.va_mask = AT_XFLAGS|AT_EXTSIZE|AT_ANEXTENTS;
+		VOP_GETATTR(vp, &va, 0, NULL, error);
+		if (error)
+			return -error;
+
+		fa.fsx_xflags	= va.va_xflags;
+		fa.fsx_extsize	= va.va_extsize;
+		fa.fsx_nextents = va.va_anextents;
+
+		if (copy_to_user((struct fsxattr *)arg, &fa, sizeof(fa)))
+			return -XFS_ERROR(EFAULT);
+		return 0;
+	}
+	
+	default:
+		return -ENOTTY;
+
+	}
+}
+
+int xfs_ioc_getbmap(
+	bhv_desc_t		*bdp,
+	struct file		*filp,
+	unsigned int		cmd,
+	unsigned long		arg)
+{
+	struct getbmap	bm;
+	int		iflags;
+	int		error;
+
+	if (copy_from_user(&bm, (struct getbmap *)arg, sizeof(bm)))
+		return -XFS_ERROR(EFAULT);
+
+	if (bm.bmv_count < 2)
+		return -XFS_ERROR(EINVAL);
+
+	iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
+	if (filp->f_mode & FINVIS)
+		iflags |= BMV_IF_NO_DMAPI_READ;
+
+	error = xfs_getbmap(bdp, &bm, (struct getbmap *)arg+1, iflags);
+	if (error)
+		return -error;
+
+	if (copy_to_user((struct getbmap *)arg, &bm, sizeof(bm)))
+		return -XFS_ERROR(EFAULT);
+	return 0;
+}
+
+int xfs_ioc_getbmapx(
+	bhv_desc_t		*bdp,
+	unsigned long		arg)
+{
+	struct getbmapx bmx;
+	struct getbmap	bm;
+	int		iflags;
+	int		error;
+
+	if (copy_from_user(&bmx, (struct getbmapx *)arg, sizeof(bmx)))
+		return -XFS_ERROR(EFAULT);
+
+	if (bmx.bmv_count < 2)
+		return -XFS_ERROR(EINVAL);
+
+	/*
+	 * Map input getbmapx structure to a getbmap
+	 * structure for xfs_getbmap.
+	 */
+	GETBMAP_CONVERT(bmx, bm);
+
+	iflags = bmx.bmv_iflags;
+
+	if (iflags & (~BMV_IF_VALID))
+		return -XFS_ERROR(EINVAL);
+
+	iflags |= BMV_IF_EXTENDED;
+
+	error = xfs_getbmap(bdp, &bm, (struct getbmapx *)arg+1, iflags);
+	if (error)
+		return -error;
+
+	GETBMAP_CONVERT(bm, bmx);
+
+	if (copy_to_user((struct getbmapx *)arg, &bmx, sizeof(bmx)))
+		return -XFS_ERROR(EFAULT);
+
+	return 0;
+}
diff --git a/fs/xfs/linux/xfs_iops.c b/fs/xfs/linux/xfs_iops.c
new file mode 100644
index 000000000000..3beca5b8fd34
--- /dev/null
+++ b/fs/xfs/linux/xfs_iops.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/xattr.h>
+
+
+/*
+ * Pull the link count and size up from the xfs inode to the linux inode
+ */
+STATIC void
+validate_fields(
+	struct inode	*ip)
+{
+	vnode_t		*vp = LINVFS_GET_VP(ip);
+	vattr_t		va;
+	int		error;
+
+	va.va_mask = AT_NLINK|AT_SIZE;
+	VOP_GETATTR(vp, &va, ATTR_LAZY, NULL, error);
+	ip->i_nlink = va.va_nlink;
+	ip->i_size = va.va_size;
+	ip->i_blocks = va.va_nblocks;
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
+/*
+ * Determine whether a process has a valid fs_struct (kernel daemons
+ * like knfsd don't have an fs_struct).
+ */
+STATIC int inline
+has_fs_struct(struct task_struct *task)
+{
+	return (task->fs != init_task.fs);
+}
+#endif
+
+STATIC int
+linvfs_mknod(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	int		mode,
+	int		rdev)
+{
+	struct inode	*ip;
+	vattr_t		va;
+	vnode_t		*vp = NULL, *dvp = LINVFS_GET_VP(dir);
+	xattr_exists_t	test_default_acl = _ACL_DEFAULT_EXISTS;
+	int		have_default_acl = 0;
+	int		error = EINVAL;
+
+	if (test_default_acl)
+		have_default_acl = test_default_acl(dvp);
+
+#ifdef CONFIG_FS_POSIX_ACL
+	/*
+	 * Conditionally compiled so that the ACL base kernel changes can be
+	 * split out into separate patches - remove this once MS_POSIXACL is
+	 * accepted, or some other way to implement this exists.
+	 */
+	if (IS_POSIXACL(dir) && !have_default_acl && has_fs_struct(current))
+		mode &= ~current->fs->umask;
+#endif
+
+	bzero(&va, sizeof(va));
+	va.va_mask = AT_TYPE|AT_MODE;
+	va.va_type = IFTOVT(mode);
+	va.va_mode = mode;
+
+	switch (mode & S_IFMT) {
+	case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK:
+		va.va_rdev = rdev;
+		va.va_mask |= AT_RDEV;
+		/*FALLTHROUGH*/
+	case S_IFREG:
+		VOP_CREATE(dvp, dentry, &va, &vp, NULL, error);
+		break;
+	case S_IFDIR:
+		VOP_MKDIR(dvp, dentry, &va, &vp, NULL, error);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	if (!error) {
+		ASSERT(vp);
+		ip = LINVFS_GET_IP(vp);
+		if (!ip) {
+			VN_RELE(vp);
+			return -ENOMEM;
+		}
+
+		if (S_ISCHR(mode) || S_ISBLK(mode))
+			ip->i_rdev = to_kdev_t(rdev);
+		/* linvfs_revalidate_core returns (-) errors */
+		error = -linvfs_revalidate_core(ip, ATTR_COMM);
+		validate_fields(dir);
+		d_instantiate(dentry, ip);
+		mark_inode_dirty_sync(ip);
+		mark_inode_dirty_sync(dir);
+	}
+
+	if (!error && have_default_acl) {
+		_ACL_DECL	(pdacl);
+
+		if (!_ACL_ALLOC(pdacl)) {
+			error = -ENOMEM;
+		} else {
+			if (_ACL_GET_DEFAULT(dvp, pdacl))
+				error = _ACL_INHERIT(vp, &va, pdacl);
+			VMODIFY(vp);
+			_ACL_FREE(pdacl);
+		}
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_create(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	int		mode)
+{
+	return linvfs_mknod(dir, dentry, mode, 0);
+}
+
+STATIC int
+linvfs_mkdir(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	int		mode)
+{
+	return linvfs_mknod(dir, dentry, mode|S_IFDIR, 0);
+}
+
+
+STATIC struct dentry *
+linvfs_lookup(
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	int		error;
+	vnode_t		*vp, *cvp;
+	struct inode	*ip = NULL;
+
+	if (dentry->d_name.len >= MAXNAMELEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	cvp = NULL;
+	vp = LINVFS_GET_VP(dir);
+	VOP_LOOKUP(vp, dentry, &cvp, 0, NULL, NULL, error);
+	if (!error) {
+		ASSERT(cvp);
+		ip = LINVFS_GET_IP(cvp);
+		if (!ip) {
+			VN_RELE(cvp);
+			return ERR_PTR(-EACCES);
+		}
+		error = -linvfs_revalidate_core(ip, ATTR_COMM);
+	}
+	if (error && (error != ENOENT))
+		return ERR_PTR(-error);
+	return d_splice_alias(ip, dentry);
+}
+
+STATIC int
+linvfs_link(
+	struct dentry	*old_dentry,
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	int		error;
+	vnode_t		*tdvp;	/* Target directory for new name/link */
+	vnode_t		*vp;	/* vp of name being linked */
+	struct inode	*ip;	/* inode of guy being linked to */
+
+	ip = old_dentry->d_inode;	/* inode being linked to */
+	if (S_ISDIR(ip->i_mode))
+		return -EPERM;
+
+	tdvp = LINVFS_GET_VP(dir);
+	vp = LINVFS_GET_VP(ip);
+
+	error = 0;
+	VOP_LINK(tdvp, vp, dentry, NULL, error);
+	if (!error) {
+		VMODIFY(tdvp);
+		VN_HOLD(vp);
+		validate_fields(ip);
+		d_instantiate(dentry, ip);
+		mark_inode_dirty_sync(ip);
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_unlink(
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	int		error = 0;
+	struct inode	*inode;
+	vnode_t		*dvp;	/* directory containing name to remove */
+
+	inode = dentry->d_inode;
+
+	dvp = LINVFS_GET_VP(dir);
+
+	VOP_REMOVE(dvp, dentry, NULL, error);
+
+	if (!error) {
+		validate_fields(dir);	/* For size only */
+		validate_fields(inode);
+		mark_inode_dirty_sync(inode);
+		mark_inode_dirty_sync(dir);
+	}
+
+	return -error;
+}
+
+STATIC int
+linvfs_symlink(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	const char	*symname)
+{
+	int		error;
+	vnode_t		*dvp;	/* directory containing name to remove */
+	vnode_t		*cvp;	/* used to lookup symlink to put in dentry */
+	vattr_t		va;
+	struct inode	*ip = NULL;
+
+	dvp = LINVFS_GET_VP(dir);
+
+	bzero(&va, sizeof(va));
+	va.va_type = VLNK;
+	va.va_mode = 0777 & ~current->fs->umask;
+	va.va_mask = AT_TYPE|AT_MODE; /* AT_PROJID? */
+
+	error = 0;
+	VOP_SYMLINK(dvp, dentry, &va, (char *)symname,
+							&cvp, NULL, error);
+	if (!error) {
+		ASSERT(cvp);
+		ASSERT(cvp->v_type == VLNK);
+		ip = LINVFS_GET_IP(cvp);
+		if (!ip) {
+			error = ENOMEM;
+			VN_RELE(cvp);
+		} else {
+			/* linvfs_revalidate_core returns (-) errors */
+			error = -linvfs_revalidate_core(ip, ATTR_COMM);
+			d_instantiate(dentry, ip);
+			validate_fields(dir);
+			mark_inode_dirty_sync(ip);
+			mark_inode_dirty_sync(dir);
+		}
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_rmdir(
+	struct inode	*dir,
+	struct dentry	*dentry)
+{
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*dvp = LINVFS_GET_VP(dir);
+	int		error;
+
+	VOP_RMDIR(dvp, dentry, NULL, error);
+	if (!error) {
+		validate_fields(inode);
+		validate_fields(dir);
+		mark_inode_dirty_sync(inode);
+		mark_inode_dirty_sync(dir);
+	}
+	return -error;
+}
+
+STATIC int
+linvfs_rename(
+	struct inode	*odir,
+	struct dentry	*odentry,
+	struct inode	*ndir,
+	struct dentry	*ndentry)
+{
+	int		error;
+	vnode_t		*fvp;	/* from directory */
+	vnode_t		*tvp;	/* target directory */
+	struct inode	*new_inode = NULL;
+
+	fvp = LINVFS_GET_VP(odir);
+	tvp = LINVFS_GET_VP(ndir);
+
+	new_inode = ndentry->d_inode;
+
+	VOP_RENAME(fvp, odentry, tvp, ndentry, NULL, error);
+	if (error)
+		return -error;
+
+	if (new_inode) {
+		validate_fields(new_inode);
+	}
+
+	validate_fields(odir);
+	if (ndir != odir)
+		validate_fields(ndir);
+	mark_inode_dirty(ndir);
+	return 0;
+}
+
+STATIC int
+linvfs_readlink(
+	struct dentry	*dentry,
+	char		*buf,
+	int		size)
+{
+	vnode_t		*vp;
+	uio_t		uio;
+	iovec_t		iov;
+	int		error;
+
+	vp = LINVFS_GET_VP(dentry->d_inode);
+
+	iov.iov_base = buf;
+	iov.iov_len = size;
+
+	uio.uio_iov = &iov;
+	uio.uio_offset = 0;
+	uio.uio_segflg = UIO_USERSPACE;
+	uio.uio_resid = size;
+
+	VOP_READLINK(vp, &uio, NULL, error);
+	if (error)
+		return -error;
+
+	return (size - uio.uio_resid);
+}
+
+/*
+ * careful here - this function can get called recusively, so
+ * we need to be very careful about how much stack we use.
+ * uio is kmalloced for this reason...
+ */
+STATIC int
+linvfs_follow_link(
+	struct dentry		*dentry,
+	struct nameidata	*nd)
+{
+	vnode_t			*vp;
+	uio_t			*uio;
+	iovec_t			iov;
+	int			error;
+	char			*link;
+
+	ASSERT(dentry);
+	ASSERT(nd);
+
+	link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL);
+	if (!link)
+		return -ENOMEM;
+
+	uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL);
+	if (!uio) {
+		kfree(link);
+		return -ENOMEM;
+	}
+
+	vp = LINVFS_GET_VP(dentry->d_inode);
+
+	iov.iov_base = link;
+	iov.iov_len = MAXNAMELEN;
+
+	uio->uio_iov = &iov;
+	uio->uio_offset = 0;
+	uio->uio_segflg = UIO_SYSSPACE;
+	uio->uio_resid = MAXNAMELEN;
+	uio->uio_fmode = 0;
+
+	VOP_READLINK(vp, uio, NULL, error);
+	if (error) {
+		kfree(uio);
+		kfree(link);
+		return -error;
+	}
+
+	link[MAXNAMELEN - uio->uio_resid] = '\0';
+	kfree(uio);
+
+	/* vfs_follow_link returns (-) errors */
+	error = vfs_follow_link(nd, link);
+	kfree(link);
+	return error;
+}
+
+STATIC int
+linvfs_permission(
+	struct inode	*inode,
+	int		mode)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error;
+
+	mode <<= 6;		/* convert from linux to vnode access bits */
+	VOP_ACCESS(vp, mode, NULL, error);
+	return -error;
+}
+
+/* Brute force approach for now - copy data into linux inode
+ * from the results of a getattr. This gets called out of things
+ * like stat.
+ */
+int
+linvfs_revalidate_core(
+	struct inode	*inode,
+	int		flags)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	/* vn_revalidate returns (-) error so this is ok */
+	return vn_revalidate(vp, flags);
+}
+
+STATIC int
+linvfs_getattr(
+	struct vfsmount	*mnt,
+	struct dentry	*dentry,
+	struct kstat	*stat)
+{
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	int		error = 0;
+
+	if (unlikely(vp->v_flag & VMODIFIED)) {
+		error = linvfs_revalidate_core(inode, 0);
+	}
+	if (!error)
+		generic_fillattr(inode, stat);
+	return 0;
+}
+
+STATIC int
+linvfs_setattr(
+	struct dentry	*dentry,
+	struct iattr	*attr)
+{
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+	vattr_t		vattr;
+	unsigned int	ia_valid = attr->ia_valid;
+	int		error;
+	int		flags = 0;
+
+	memset(&vattr, 0, sizeof(vattr_t));
+	if (ia_valid & ATTR_UID) {
+		vattr.va_mask |= AT_UID;
+		vattr.va_uid = attr->ia_uid;
+	}
+	if (ia_valid & ATTR_GID) {
+		vattr.va_mask |= AT_GID;
+		vattr.va_gid = attr->ia_gid;
+	}
+	if (ia_valid & ATTR_SIZE) {
+		vattr.va_mask |= AT_SIZE;
+		vattr.va_size = attr->ia_size;
+	}
+	if (ia_valid & ATTR_ATIME) {
+		vattr.va_mask |= AT_ATIME;
+		vattr.va_atime.tv_sec = attr->ia_atime;
+		vattr.va_atime.tv_nsec = 0;
+	}
+	if (ia_valid & ATTR_MTIME) {
+		vattr.va_mask |= AT_MTIME;
+		vattr.va_mtime.tv_sec = attr->ia_mtime;
+		vattr.va_mtime.tv_nsec = 0;
+	}
+	if (ia_valid & ATTR_CTIME) {
+		vattr.va_mask |= AT_CTIME;
+		vattr.va_ctime.tv_sec = attr->ia_ctime;
+		vattr.va_ctime.tv_nsec = 0;
+	}
+	if (ia_valid & ATTR_MODE) {
+		vattr.va_mask |= AT_MODE;
+		vattr.va_mode = attr->ia_mode;
+		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+			inode->i_mode &= ~S_ISGID;
+	}
+
+	if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))
+		flags = ATTR_UTIME;
+
+	VOP_SETATTR(vp, &vattr, flags, NULL, error);
+	if (error)
+		return(-error); /* Positive error up from XFS */
+	if (ia_valid & ATTR_SIZE) {
+		error = vmtruncate(inode, attr->ia_size);
+	}
+
+	if (!error) {
+		vn_revalidate(vp, 0);
+		mark_inode_dirty_sync(inode);
+	}
+	return error;
+}
+
+STATIC void
+linvfs_truncate(
+	struct inode		*inode)
+{
+	block_truncate_page(inode->i_mapping, inode->i_size, linvfs_get_block);
+}
+
+
+
+/*
+ * Extended attributes interfaces
+ */
+
+#define SYSTEM_NAME	"system."	/* VFS shared names/values */
+#define ROOT_NAME	"xfsroot."	/* XFS ondisk names/values */
+#define USER_NAME	"user."		/* user's own names/values */
+STATIC xattr_namespace_t xfs_namespace_array[] = {
+	{ .name= SYSTEM_NAME,	.namelen= sizeof(SYSTEM_NAME)-1,.exists= NULL },
+	{ .name= ROOT_NAME,	.namelen= sizeof(ROOT_NAME)-1,	.exists= NULL },
+	{ .name= USER_NAME,	.namelen= sizeof(USER_NAME)-1,	.exists= NULL },
+	{ .name= NULL }
+};
+xattr_namespace_t *xfs_namespaces = &xfs_namespace_array[0];
+
+#define POSIXACL_ACCESS		"posix_acl_access"
+#define POSIXACL_ACCESS_SIZE	(sizeof(POSIXACL_ACCESS)-1)
+#define POSIXACL_DEFAULT	"posix_acl_default"
+#define POSIXACL_DEFAULT_SIZE	(sizeof(POSIXACL_DEFAULT)-1)
+#define POSIXCAP		"posix_capabilities"
+#define POSIXCAP_SIZE		(sizeof(POSIXCAP)-1)
+#define POSIXMAC		"posix_mac"
+#define POSIXMAC_SIZE		(sizeof(POSIXMAC)-1)
+STATIC xattr_namespace_t sys_namespace_array[] = {
+	{ .name= POSIXACL_ACCESS,
+		.namelen= POSIXACL_ACCESS_SIZE,	 .exists= _ACL_ACCESS_EXISTS },
+	{ .name= POSIXACL_DEFAULT,
+		.namelen= POSIXACL_DEFAULT_SIZE, .exists= _ACL_DEFAULT_EXISTS },
+	{ .name= POSIXCAP,
+		.namelen= POSIXCAP_SIZE,	 .exists= _CAP_EXISTS },
+	{ .name= POSIXMAC,
+		.namelen= POSIXMAC_SIZE,	 .exists= _MAC_EXISTS },
+	{ .name= NULL }
+};
+
+/*
+ * Some checks to prevent people abusing EAs to get over quota:
+ * - Don't allow modifying user EAs on devices/symlinks;
+ * - Don't allow modifying user EAs if sticky bit set;
+ */
+STATIC int
+capable_user_xattr(
+	struct inode	*inode)
+{
+	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
+	    !capable(CAP_SYS_ADMIN))
+		return 0;
+	if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
+	    (current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		return 0;
+	return 1;
+}
+
+STATIC int
+linvfs_setxattr(
+	struct dentry	*dentry,
+	const char	*name,
+	void		*data,
+	size_t		size,
+	int		flags)
+{
+	int		error;
+	int		xflags = 0;
+	char		*p = (char *)name;
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	if (strncmp(name, xfs_namespaces[SYSTEM_NAMES].name,
+			xfs_namespaces[SYSTEM_NAMES].namelen) == 0) {
+		error = -EINVAL;
+		if (flags & XATTR_CREATE)
+			 return error;
+		error = -ENOATTR;
+		p += xfs_namespaces[SYSTEM_NAMES].namelen;
+		if (strcmp(p, POSIXACL_ACCESS) == 0) {
+			if (vp->v_flag & VMODIFIED) {
+				error = linvfs_revalidate_core(inode, 0);
+				if (error)
+					return error;
+			}
+			error = xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
+			if (!error) {
+				VMODIFY(vp);
+				error = linvfs_revalidate_core(inode, 0);
+			}
+		}
+		else if (strcmp(p, POSIXACL_DEFAULT) == 0) {
+			error = linvfs_revalidate_core(inode, 0);
+			if (error)
+				return error;
+			error = xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
+			if (!error) {
+				VMODIFY(vp);
+				error = linvfs_revalidate_core(inode, 0);
+			}
+		}
+		else if (strcmp(p, POSIXCAP) == 0) {
+			error = xfs_cap_vset(vp, data, size);
+		}
+		return error;
+	}
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (flags & XATTR_CREATE)
+		xflags |= ATTR_CREATE;
+	if (flags & XATTR_REPLACE)
+		xflags |= ATTR_REPLACE;
+
+	if (strncmp(name, xfs_namespaces[ROOT_NAMES].name,
+			xfs_namespaces[ROOT_NAMES].namelen) == 0) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xflags |= ATTR_ROOT;
+		p += xfs_namespaces[ROOT_NAMES].namelen;
+		VOP_ATTR_SET(vp, p, data, size, xflags, NULL, error);
+		return -error;
+	}
+	if (strncmp(name, xfs_namespaces[USER_NAMES].name,
+			xfs_namespaces[USER_NAMES].namelen) == 0) {
+		if (!capable_user_xattr(inode))
+			return -EPERM;
+		p += xfs_namespaces[USER_NAMES].namelen;
+		VOP_ATTR_SET(vp, p, data, size, xflags, NULL, error);
+		return -error;
+	}
+	return -ENOATTR;
+}
+
+STATIC ssize_t
+linvfs_getxattr(
+	struct dentry	*dentry,
+	const char	*name,
+	void		*data,
+	size_t		size)
+{
+	ssize_t		error;
+	int		xflags = 0;
+	char		*p = (char *)name;
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	if (strncmp(name, xfs_namespaces[SYSTEM_NAMES].name,
+			xfs_namespaces[SYSTEM_NAMES].namelen) == 0) {
+		error = -ENOATTR;
+		p += xfs_namespaces[SYSTEM_NAMES].namelen;
+		if (strcmp(p, POSIXACL_ACCESS) == 0) {
+			if (vp->v_flag & VMODIFIED) {
+				error = linvfs_revalidate_core(inode, 0);
+				if (error)
+					return error;
+			}
+			error = xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
+		}
+		else if (strcmp(p, POSIXACL_DEFAULT) == 0) {
+			if (vp->v_flag & VMODIFIED) {
+				error = linvfs_revalidate_core(inode, 0);
+				if (error)
+					return error;
+			}
+			error = xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
+		}
+		else if (strcmp(p, POSIXCAP) == 0) {
+			error = xfs_cap_vget(vp, data, size);
+		}
+		return error;
+	}
+
+	/* Convert Linux syscall to XFS internal ATTR flags */
+	if (!size)
+		xflags |= ATTR_KERNOVAL;
+
+	if (strncmp(name, xfs_namespaces[ROOT_NAMES].name,
+			xfs_namespaces[ROOT_NAMES].namelen) == 0) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xflags |= ATTR_ROOT;
+		p += xfs_namespaces[ROOT_NAMES].namelen;
+		VOP_ATTR_GET(vp, p, data, (int *)&size, xflags, NULL, error);
+		if (!error)
+			error = -size;
+		return -error;
+	}
+	if (strncmp(name, xfs_namespaces[USER_NAMES].name,
+			xfs_namespaces[USER_NAMES].namelen) == 0) {
+		p += xfs_namespaces[USER_NAMES].namelen;
+		if (!capable_user_xattr(inode))
+			return -EPERM;
+		VOP_ATTR_GET(vp, p, data, (int *)&size, xflags, NULL, error);
+		if (!error)
+			error = -size;
+		return -error;
+	}
+	return -ENOATTR;
+}
+
+
+STATIC ssize_t
+linvfs_listxattr(
+	struct dentry		*dentry,
+	char			*data,
+	size_t			size)
+{
+	ssize_t			error;
+	int			result = 0;
+	int			xflags = ATTR_KERNAMELS;
+	char			*k = data;
+	attrlist_cursor_kern_t	cursor;
+	xattr_namespace_t	*sys;
+	vnode_t			*vp;
+
+	vp = LINVFS_GET_VP(dentry->d_inode);
+
+	if (!size)
+		xflags |= ATTR_KERNOVAL;
+	if (capable(CAP_SYS_ADMIN))
+		xflags |= ATTR_KERNFULLS;
+
+	memset(&cursor, 0, sizeof(cursor));
+	VOP_ATTR_LIST(vp, data, size, xflags, &cursor, NULL, error);
+	if (error > 0)
+		return -error;
+	result += -error;
+
+	k += result;		/* advance start of our buffer */
+	for (sys = &sys_namespace_array[0]; sys->name != NULL; sys++) {
+		if (sys->exists == NULL || !sys->exists(vp))
+			continue;
+		result += xfs_namespaces[SYSTEM_NAMES].namelen;
+		result += sys->namelen + 1;
+		if (size) {
+			if (result > size)
+				return -ERANGE;
+			strcpy(k, xfs_namespaces[SYSTEM_NAMES].name);
+			k += xfs_namespaces[SYSTEM_NAMES].namelen;
+			strcpy(k, sys->name);
+			k += sys->namelen + 1;
+		}
+	}
+	return result;
+}
+
+STATIC int
+linvfs_removexattr(
+	struct dentry	*dentry,
+	const char	*name)
+{
+	int		error;
+	int		xflags = 0;
+	char		*p = (char *)name;
+	struct inode	*inode = dentry->d_inode;
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	if (strncmp(name, xfs_namespaces[SYSTEM_NAMES].name,
+			xfs_namespaces[SYSTEM_NAMES].namelen) == 0) {
+		error = -ENOATTR;
+		p += xfs_namespaces[SYSTEM_NAMES].namelen;
+		if (strcmp(p, POSIXACL_ACCESS) == 0)
+			error = xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
+		else if (strcmp(p, POSIXACL_DEFAULT) == 0)
+			error = xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
+		else if (strcmp(p, POSIXCAP) == 0)
+			error = xfs_cap_vremove(vp);
+		return error;
+	}
+
+	if (strncmp(name, xfs_namespaces[ROOT_NAMES].name,
+			xfs_namespaces[ROOT_NAMES].namelen) == 0) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		xflags |= ATTR_ROOT;
+		p += xfs_namespaces[ROOT_NAMES].namelen;
+		VOP_ATTR_REMOVE(vp, p, xflags, NULL, error);
+		return -error;
+	}
+	if (strncmp(name, xfs_namespaces[USER_NAMES].name,
+			xfs_namespaces[USER_NAMES].namelen) == 0) {
+		p += xfs_namespaces[USER_NAMES].namelen;
+		if (!capable_user_xattr(inode))
+			return -EPERM;
+		VOP_ATTR_REMOVE(vp, p, xflags, NULL, error);
+		return -error;
+	}
+	return -ENOATTR;
+}
+
+
+struct inode_operations linvfs_file_inode_operations =
+{
+	.permission		= linvfs_permission,
+	.getattr		= linvfs_getattr,
+	.truncate		= linvfs_truncate,
+	.setattr		= linvfs_setattr,
+	.setxattr		= linvfs_setxattr,
+	.getxattr		= linvfs_getxattr,
+	.listxattr		= linvfs_listxattr,
+	.removexattr		= linvfs_removexattr,
+};
+
+struct inode_operations linvfs_dir_inode_operations =
+{
+	.create			= linvfs_create,
+	.lookup			= linvfs_lookup,
+	.link			= linvfs_link,
+	.unlink			= linvfs_unlink,
+	.symlink		= linvfs_symlink,
+	.mkdir			= linvfs_mkdir,
+	.rmdir			= linvfs_rmdir,
+	.mknod			= linvfs_mknod,
+	.rename			= linvfs_rename,
+	.permission		= linvfs_permission,
+	.getattr		= linvfs_getattr,
+	.setattr		= linvfs_setattr,
+	.setxattr		= linvfs_setxattr,
+	.getxattr		= linvfs_getxattr,
+	.listxattr		= linvfs_listxattr,
+	.removexattr		= linvfs_removexattr,
+};
+
+struct inode_operations linvfs_symlink_inode_operations =
+{
+	.readlink		= linvfs_readlink,
+	.follow_link		= linvfs_follow_link,
+	.permission		= linvfs_permission,
+	.getattr		= linvfs_getattr,
+	.setattr		= linvfs_setattr,
+	.setxattr		= linvfs_setxattr,
+	.getxattr		= linvfs_getxattr,
+	.listxattr		= linvfs_listxattr,
+	.removexattr		= linvfs_removexattr,
+};
diff --git a/fs/xfs/linux/xfs_iops.h b/fs/xfs/linux/xfs_iops.h
new file mode 100644
index 000000000000..3c4529374aec
--- /dev/null
+++ b/fs/xfs/linux/xfs_iops.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_IOPS_H__
+#define __XFS_IOPS_H__
+
+/*
+ * Extended system attributes.
+ * So far only POSIX ACLs are supported, but this will need to
+ * grow in time (capabilities, mandatory access control, etc).
+ */
+#define XFS_SYSTEM_NAMESPACE	SYSTEM_POSIXACL
+
+/*
+ * Define a table of the namespaces XFS supports
+ */
+typedef int (*xattr_exists_t)(vnode_t *);
+
+typedef struct xattr_namespace {
+	char		*name;
+	unsigned int	namelen;
+	xattr_exists_t	exists;
+} xattr_namespace_t;
+
+#define SYSTEM_NAMES	0
+#define ROOT_NAMES	1
+#define USER_NAMES	2
+extern struct xattr_namespace *xfs_namespaces;
+
+
+extern struct inode_operations linvfs_file_inode_operations;
+extern struct inode_operations linvfs_dir_inode_operations;
+extern struct inode_operations linvfs_symlink_inode_operations;
+
+extern struct file_operations linvfs_file_operations;
+extern struct file_operations linvfs_dir_operations;
+
+extern struct address_space_operations linvfs_aops;
+
+extern int linvfs_revalidate_core(struct inode *, int);
+extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+
+#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux/xfs_linux.h b/fs/xfs/linux/xfs_linux.h
new file mode 100644
index 000000000000..7def3bb302b8
--- /dev/null
+++ b/fs/xfs/linux/xfs_linux.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_LINUX__
+#define __XFS_LINUX__
+
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/major.h>
+#include <linux/root_dev.h>
+
+#include <asm/page.h>
+#include <asm/div64.h>
+#include <asm/param.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+
+#include <linux/xfs_behavior.h>
+#include <linux/xfs_vfs.h>
+#include <linux/xfs_cred.h>
+#include <linux/xfs_vnode.h>
+#include <linux/xfs_stats.h>
+#include <linux/xfs_sysctl.h>
+#include <linux/xfs_iops.h>
+#include <linux/xfs_super.h>
+#include <linux/xfs_globals.h>
+#include <linux/xfs_fs_subr.h>
+
+#include <pagebuf/page_buf.h>
+
+#ifndef STATIC
+#define STATIC static
+#endif
+
+typedef struct xfs_dirent {		/* data from readdir() */
+	xfs_ino_t	d_ino;		/* inode number of entry */
+	xfs_off_t	d_off;		/* offset of disk directory entry */
+	unsigned short	d_reclen;	/* length of this record */
+	char		d_name[1];	/* name of file */
+} xfs_dirent_t;
+
+#define DIRENTBASESIZE		(((xfs_dirent_t *)0)->d_name - (char *)0)
+#define DIRENTSIZE(namelen)	\
+	((DIRENTBASESIZE + (namelen) + \
+		sizeof(xfs_off_t)) & ~(sizeof(xfs_off_t) - 1))
+
+#define NBPP		PAGE_SIZE
+#define DPPSHFT		(PAGE_SHIFT - 9)
+#define NDPP		(1 << (PAGE_SHIFT - 9))
+#define dtop(DD)	(((DD) + NDPP - 1) >> DPPSHFT)
+#define dtopt(DD)	((DD) >> DPPSHFT)
+#define dpoff(DD)	((DD) & (NDPP-1))
+
+#define NBBY		8		/* number of bits per byte */
+#define NBPC		PAGE_SIZE	/* Number of bytes per click */
+#define BPCSHIFT	PAGE_SHIFT	/* LOG2(NBPC) if exact */
+
+/*
+ * Size of block device i/o is parameterized here.
+ * Currently the system supports page-sized i/o.
+ */
+#define BLKDEV_IOSHIFT		BPCSHIFT
+#define BLKDEV_IOSIZE		(1<<BLKDEV_IOSHIFT)
+/* number of BB's per block device block */
+#define BLKDEV_BB		BTOBB(BLKDEV_IOSIZE)
+
+/* bytes to clicks */
+#define btoc(x)		(((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
+#define btoct(x)	((__psunsigned_t)(x)>>BPCSHIFT)
+#define btoc64(x)	(((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
+#define btoct64(x)	((__uint64_t)(x)>>BPCSHIFT)
+#define io_btoc(x)	(((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT)
+#define io_btoct(x)	((__psunsigned_t)(x)>>IO_BPCSHIFT)
+
+/* off_t bytes to clicks */
+#define offtoc(x)	(((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
+#define offtoct(x)	((xfs_off_t)(x)>>BPCSHIFT)
+
+/* clicks to off_t bytes */
+#define ctooff(x)	((xfs_off_t)(x)<<BPCSHIFT)
+
+/* clicks to bytes */
+#define ctob(x)		((__psunsigned_t)(x)<<BPCSHIFT)
+#define btoct(x)	((__psunsigned_t)(x)>>BPCSHIFT)
+#define ctob64(x)	((__uint64_t)(x)<<BPCSHIFT)
+#define io_ctob(x)	((__psunsigned_t)(x)<<IO_BPCSHIFT)
+
+/* bytes to clicks */
+#define btoc(x)		(((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
+
+#ifndef CELL_CAPABLE
+#define FSC_NOTIFY_NAME_CHANGED(vp)
+#endif
+
+#ifndef ENOATTR
+#define ENOATTR		ENODATA		/* Attribute not found */
+#endif
+
+/* Note: EWRONGFS never visible outside the kernel */
+#define EWRONGFS	EINVAL		/* Mount with wrong filesystem type */
+
+/*
+ * XXX EFSCORRUPTED needs a real value in errno.h. asm-i386/errno.h won't
+ *     return codes out of its known range in errno.
+ * XXX Also note: needs to be < 1000 and fairly unique on Linux (mustn't
+ *     conflict with any code we use already or any code a driver may use)
+ * XXX Some options (currently we do #2):
+ *	1/ New error code ["Filesystem is corrupted", _after_ glibc updated]
+ *	2/ 990 ["Unknown error 990"]
+ *	3/ EUCLEAN ["Structure needs cleaning"]
+ *	4/ Convert EFSCORRUPTED to EIO [just prior to return into userspace]
+ */
+#define EFSCORRUPTED	990		/* Filesystem is corrupted */
+
+#define SYNCHRONIZE()	barrier()
+#define lbolt		jiffies
+#define rootdev		ROOT_DEV
+#define __return_address __builtin_return_address(0)
+#define LONGLONG_MAX	9223372036854775807LL	/* max "long long int" */
+#define nopkg()		( ENOSYS )
+
+/* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */
+/* we may well need to fine-tune this if it ever becomes an issue.  */
+#define DQUOT_MAX_HEURISTIC	1024	/* NR_DQUOTS */
+#define ndquot			DQUOT_MAX_HEURISTIC
+
+/* IRIX uses the current size of the name cache to guess a good value */
+/* - this isn't the same but is a good enough starting point for now. */
+#define DQUOT_HASH_HEURISTIC	files_stat.nr_files
+
+/* IRIX inodes maintain the project ID also, zero this field on Linux */
+#define DEFAULT_PROJID	0
+#define dfltprid	DEFAULT_PROJID
+
+#define MAXNAMELEN	256
+#define MAXPATHLEN	1024
+
+#define FINVIS		0x0100	/* don't update timestamps - XFS */
+
+#define MIN(a,b)	(min(a,b))
+#define MAX(a,b)	(max(a,b))
+#define howmany(x, y)	(((x)+((y)-1))/(y))
+#define roundup(x, y)	((((x)+((y)-1))/(y))*(y))
+
+/* Move the kernel do_div definition off to one side */
+
+#if defined __i386__
+/* For ia32 we need to pull some tricks to get past various versions
+ * of the compiler which do not like us using do_div in the middle
+ * of large functions.
+ */
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+	__u32	mod;
+
+	switch (n) {
+		case 4:
+			mod = *(__u32 *)a % b;
+			*(__u32 *)a = *(__u32 *)a / b;
+			return mod;
+		case 8:
+			{
+			unsigned long __upper, __low, __high, __mod;
+			__u64	c = *(__u64 *)a;
+			__upper = __high = c >> 32;
+			__low = c;
+			if (__high) {
+				__upper = __high % (b);
+				__high = __high / (b);
+			}
+			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+			asm("":"=A" (c):"a" (__low),"d" (__high));
+			*(__u64 *)a = c;
+			return __mod;
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+	switch (n) {
+		case 4:
+			return *(__u32 *)a % b;
+		case 8:
+			{
+			unsigned long __upper, __low, __high, __mod;
+			__u64	c = *(__u64 *)a;
+			__upper = __high = c >> 32;
+			__low = c;
+			if (__high) {
+				__upper = __high % (b);
+				__high = __high / (b);
+			}
+			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+			asm("":"=A" (c):"a" (__low),"d" (__high));
+			return __mod;
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+#else
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+	__u32	mod;
+
+	switch (n) {
+		case 4:
+			mod = *(__u32 *)a % b;
+			*(__u32 *)a = *(__u32 *)a / b;
+			return mod;
+		case 8:
+			mod = do_div(*(__u64 *)a, b);
+			return mod;
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+	switch (n) {
+		case 4:
+			return *(__u32 *)a % b;
+		case 8:
+			{
+			__u64	c = *(__u64 *)a;
+			return do_div(c, b);
+			}
+	}
+
+	/* NOTREACHED */
+	return 0;
+}
+#endif
+
+#undef do_div
+#define do_div(a, b)	xfs_do_div(&(a), (b), sizeof(a))
+#define do_mod(a, b)	xfs_do_mod(&(a), (b), sizeof(a))
+
+static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
+{
+	x += y - 1;
+	do_div(x, y);
+	return(x * y);
+}
+
+#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux/xfs_lrw.c b/fs/xfs/linux/xfs_lrw.c
new file mode 100644
index 000000000000..b338349bf5b5
--- /dev/null
+++ b/fs/xfs/linux/xfs_lrw.c
@@ -0,0 +1,1813 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+/*
+ *  fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
+ *
+ */
+
+#include <xfs.h>
+#include <linux/pagemap.h>
+#include <linux/capability.h>
+
+
+#define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
+						<< mp->m_writeio_log)
+#define XFS_STRAT_WRITE_IMAPS	2
+
+STATIC int xfs_iomap_read(xfs_iocore_t *, loff_t, size_t, int, pb_bmap_t *,
+			int *, struct pm *);
+STATIC int xfs_iomap_write(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *,
+			int *, int, struct pm *);
+STATIC int xfs_iomap_write_delay(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *,
+			int *, int, int);
+STATIC int xfs_iomap_write_direct(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *,
+			int *, int, int);
+STATIC int _xfs_imap_to_bmap(xfs_iocore_t *, xfs_off_t, xfs_bmbt_irec_t *,
+			pb_bmap_t *, int, int);
+
+
+/*
+ *	xfs_iozero
+ *
+ *	xfs_iozero clears the specified range of buffer supplied,
+ *	and marks all the affected blocks as valid and modified.  If
+ *	an affected block is not allocated, it will be allocated.  If
+ *	an affected block is not completely overwritten, and is not
+ *	valid before the operation, it will be read from disk before
+ *	being partially zeroed.
+ */
+STATIC int
+xfs_iozero(
+	struct inode		*ip,	/* inode 			*/
+	loff_t			pos,	/* offset in file		*/
+	size_t			count,	/* size of data to zero		*/
+	loff_t			end_size)	/* max file size to set */
+{
+	unsigned		bytes;
+	struct page		*page;
+	struct address_space	*mapping;
+	char			*kaddr;
+	int			status;
+
+	mapping = ip->i_mapping;
+	do {
+		unsigned long index, offset;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		status = -ENOMEM;
+		page = grab_cache_page(mapping, index);
+		if (!page)
+			break;
+
+		kaddr = kmap(page);
+		status = mapping->a_ops->prepare_write(NULL, page, offset,
+							offset + bytes);
+		if (status) {
+			goto unlock;
+		}
+
+		memset((void *) (kaddr + offset), 0, bytes);
+		flush_dcache_page(page);
+		status = mapping->a_ops->commit_write(NULL, page, offset,
+							offset + bytes);
+		if (!status) {
+			pos += bytes;
+			count -= bytes;
+			if (pos > ip->i_size)
+				ip->i_size = pos < end_size ? pos : end_size;
+		}
+
+unlock:
+		kunmap(page);
+		unlock_page(page);
+		page_cache_release(page);
+		if (status)
+			break;
+	} while (count);
+
+	return (-status);
+}
+
+ssize_t			/* bytes read, or (-)  error */
+xfs_read(
+	bhv_desc_t	*bdp,
+	struct file	*file,
+	char		*buf,
+	size_t		size,
+	loff_t		*offset,
+	cred_t		*credp)
+{
+	ssize_t		ret;
+	xfs_fsize_t	n;
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	XFS_STATS_INC(xfsstats.xs_read_calls);
+
+	if (file->f_flags & O_DIRECT) {
+		if (((__psint_t)buf & BBMASK) ||
+		    (*offset & mp->m_blockmask) ||
+		    (size & mp->m_blockmask)) {
+			if (*offset == ip->i_d.di_size) {
+				return (0);
+			}
+			return -XFS_ERROR(EINVAL);
+		}
+	}
+
+
+	n = XFS_MAX_FILE_OFFSET - *offset;
+	if ((n <= 0) || (size == 0))
+		return 0;
+
+	if (n < size)
+		size = n;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		return -EIO;
+	}
+
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+	if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
+	    !(file->f_mode & FINVIS)) {
+		int error;
+		vrwlock_t locktype = VRWLOCK_READ;
+
+		error = xfs_dm_send_data_event(DM_EVENT_READ, bdp,
+					     *offset, size,
+					     FILP_DELAY_FLAG(file),
+					     &locktype);
+		if (error) {
+			xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+			return -error;
+		}
+	}
+
+	ret = generic_file_read(file, buf, size, offset);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	XFS_STATS_ADD(xfsstats.xs_read_bytes, ret);
+
+	if (!(file->f_mode & FINVIS))
+		xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
+
+	return ret;
+}
+
+/*
+ * This routine is called to handle zeroing any space in the last
+ * block of the file that is beyond the EOF.  We do this since the
+ * size is being increased without writing anything to that block
+ * and we don't want anyone to read the garbage on the disk.
+ */
+STATIC int				/* error (positive) */
+xfs_zero_last_block(
+	struct inode	*ip,
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,
+	xfs_fsize_t	isize,
+	xfs_fsize_t	end_size,
+	struct pm	*pmp)
+{
+	xfs_fileoff_t	last_fsb;
+	xfs_mount_t	*mp;
+	int		nimaps;
+	int		zero_offset;
+	int		zero_len;
+	int		isize_fsb_offset;
+	int		error = 0;
+	xfs_bmbt_irec_t imap;
+	loff_t		loff;
+	size_t		lsize;
+
+	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
+	ASSERT(offset > isize);
+
+	mp = io->io_mount;
+
+	isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
+	if (isize_fsb_offset == 0) {
+		/*
+		 * There are no extra bytes in the last block on disk to
+		 * zero, so return.
+		 */
+		return 0;
+	}
+
+	last_fsb = XFS_B_TO_FSBT(mp, isize);
+	nimaps = 1;
+	error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
+			  &nimaps, NULL);
+	if (error) {
+		return error;
+	}
+	ASSERT(nimaps > 0);
+	/*
+	 * If the block underlying isize is just a hole, then there
+	 * is nothing to zero.
+	 */
+	if (imap.br_startblock == HOLESTARTBLOCK) {
+		return 0;
+	}
+	/*
+	 * Get a pagebuf for the last block, zero the part beyond the
+	 * EOF, and write it out sync.	We need to drop the ilock
+	 * while we do this so we don't deadlock when the buffer cache
+	 * calls back to us.
+	 */
+	XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
+	loff = XFS_FSB_TO_B(mp, last_fsb);
+	lsize = XFS_FSB_TO_B(mp, 1);
+
+	zero_offset = isize_fsb_offset;
+	zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
+
+	error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
+
+	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+	ASSERT(error >= 0);
+	return error;
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new,
+ * larger EOF.	This handles the normal case of zeroing the remainder
+ * of the last block in the file and the unusual case of zeroing blocks
+ * out beyond the size of the file.  This second case only happens
+ * with fixed size extents and when the system crashes before the inode
+ * size was updated but after blocks were allocated.  If fill is set,
+ * then any holes in the range are filled and zeroed.  If not, the holes
+ * are left alone as holes.
+ */
+
+int					/* error (positive) */
+xfs_zero_eof(
+	vnode_t		*vp,
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,		/* starting I/O offset */
+	xfs_fsize_t	isize,		/* current inode size */
+	xfs_fsize_t	end_size,	/* terminal inode size */
+	struct pm	*pmp)
+{
+	struct inode	*ip = LINVFS_GET_IP(vp);
+	xfs_fileoff_t	start_zero_fsb;
+	xfs_fileoff_t	end_zero_fsb;
+	xfs_fileoff_t	prev_zero_fsb;
+	xfs_fileoff_t	zero_count_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_extlen_t	buf_len_fsb;
+	xfs_extlen_t	prev_zero_count;
+	xfs_mount_t	*mp;
+	int		nimaps;
+	int		error = 0;
+	xfs_bmbt_irec_t imap;
+	loff_t		loff;
+	size_t		lsize;
+
+	ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
+	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+
+	mp = io->io_mount;
+
+	/*
+	 * First handle zeroing the block on which isize resides.
+	 * We only zero a part of that block so it is handled specially.
+	 */
+	error = xfs_zero_last_block(ip, io, offset, isize, end_size, pmp);
+	if (error) {
+		ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
+		ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+		return error;
+	}
+
+	/*
+	 * Calculate the range between the new size and the old
+	 * where blocks needing to be zeroed may exist.	 To get the
+	 * block where the last byte in the file currently resides,
+	 * we need to subtract one from the size and truncate back
+	 * to a block boundary.	 We subtract 1 in case the size is
+	 * exactly on a block boundary.
+	 */
+	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+
+	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+	if (last_fsb == end_zero_fsb) {
+		/*
+		 * The size was only incremented on its last block.
+		 * We took care of that above, so just return.
+		 */
+		return 0;
+	}
+
+	ASSERT(start_zero_fsb <= end_zero_fsb);
+	prev_zero_fsb = NULLFILEOFF;
+	prev_zero_count = 0;
+	/*
+	 * Maybe change this loop to do the bmapi call and
+	 * loop while we split the mappings into pagebufs?
+	 */
+	while (start_zero_fsb <= end_zero_fsb) {
+		nimaps = 1;
+		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+		error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
+				  0, NULL, 0, &imap, &nimaps, NULL);
+		if (error) {
+			ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
+			ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
+			return error;
+		}
+		ASSERT(nimaps > 0);
+
+		if (imap.br_startblock == HOLESTARTBLOCK) {
+			/*
+			 * This loop handles initializing pages that were
+			 * partially initialized by the code below this
+			 * loop. It basically zeroes the part of the page
+			 * that sits on a hole and sets the page as P_HOLE
+			 * and calls remapf if it is a mapped file.
+			 */
+			prev_zero_fsb = NULLFILEOFF;
+			prev_zero_count = 0;
+			start_zero_fsb = imap.br_startoff +
+					 imap.br_blockcount;
+			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+			continue;
+		}
+
+		/*
+		 * There are blocks in the range requested.
+		 * Zero them a single write at a time.	We actually
+		 * don't zero the entire range returned if it is
+		 * too big and simply loop around to get the rest.
+		 * That is not the most efficient thing to do, but it
+		 * is simple and this path should not be exercised often.
+		 */
+		buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
+					      mp->m_writeio_blocks << 8);
+		/*
+		 * Drop the inode lock while we're doing the I/O.
+		 * We'll still have the iolock to protect us.
+		 */
+		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+
+		loff = XFS_FSB_TO_B(mp, start_zero_fsb);
+		lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
+
+		error = xfs_iozero(ip, loff, lsize, end_size);
+
+		if (error) {
+			goto out_lock;
+		}
+
+		prev_zero_fsb = start_zero_fsb;
+		prev_zero_count = buf_len_fsb;
+		start_zero_fsb = imap.br_startoff + buf_len_fsb;
+		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+
+		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+	}
+
+	return 0;
+
+out_lock:
+
+	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+	ASSERT(error >= 0);
+	return error;
+}
+
+ssize_t				/* bytes written, or (-) error */
+xfs_write(
+	bhv_desc_t	*bdp,
+	struct file	*file,
+	const char	*buf,
+	size_t		size,
+	loff_t		*offset,
+	cred_t		*credp)
+{
+	xfs_inode_t	*xip;
+	xfs_mount_t	*mp;
+	ssize_t		ret;
+	int		error = 0;
+	xfs_fsize_t	isize, new_size;
+	xfs_fsize_t	n, limit = XFS_MAX_FILE_OFFSET;
+	xfs_iocore_t	*io;
+	vnode_t		*vp;
+	int		iolock;
+	int		direct = file->f_flags & O_DIRECT;
+	int		eventsent = 0;
+	vrwlock_t	locktype;
+
+	XFS_STATS_INC(xfsstats.xs_write_calls);
+
+	vp = BHV_TO_VNODE(bdp);
+	xip = XFS_BHVTOI(bdp);
+
+	if (size == 0)
+		return 0;
+
+	io = &(xip->i_iocore);
+	mp = io->io_mount;
+
+	xfs_check_frozen(mp, bdp, XFS_FREEZE_WRITE);
+
+	if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
+		return -EIO;
+	}
+
+	if (direct) {
+		if (((__psint_t)buf & BBMASK) ||
+		    (*offset & mp->m_blockmask) ||
+		    (size  & mp->m_blockmask)) {
+			return XFS_ERROR(-EINVAL);
+		}
+		iolock = XFS_IOLOCK_SHARED;
+		locktype = VRWLOCK_WRITE_DIRECT;
+	} else {
+		iolock = XFS_IOLOCK_EXCL;
+		locktype = VRWLOCK_WRITE;
+	}
+
+	xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
+	isize = xip->i_d.di_size;
+
+	if (file->f_flags & O_APPEND)
+		*offset = isize;
+
+start:
+	n = limit - *offset;
+	if (n <= 0) {
+		xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+		return -EFBIG;
+	}
+	if (n < size)
+		size = n;
+
+	new_size = *offset + size;
+	if (new_size > isize) {
+		io->io_new_size = new_size;
+	}
+
+	if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
+	    !(file->f_mode & FINVIS) && !eventsent)) {
+		loff_t		savedsize = *offset;
+
+		xfs_iunlock(xip, XFS_ILOCK_EXCL);
+		error = xfs_dm_send_data_event(DM_EVENT_WRITE, bdp,
+				*offset, size,
+				FILP_DELAY_FLAG(file), &locktype);
+		if (error) {
+			xfs_iunlock(xip, iolock);
+			return -error;
+		}
+		xfs_ilock(xip, XFS_ILOCK_EXCL);
+		eventsent = 1;
+
+		/*
+		 * The iolock was dropped and reaquired in
+		 * xfs_dm_send_data_event so we have to recheck the size
+		 *  when appending.  We will only "goto start;" once,
+		 *  since having sent the event prevents another call
+		 *  to xfs_dm_send_data_event, which is what
+		 *  allows the size to change in the first place.
+		 */
+		if ((file->f_flags & O_APPEND) &&
+		    savedsize != xip->i_d.di_size) {
+			*offset = isize = xip->i_d.di_size;
+			goto start;
+		}
+	}
+
+	/*
+	 * On Linux, generic_file_write updates the times even if
+	 * no data is copied in so long as the write had a size.
+	 *
+	 * We must update xfs' times since revalidate will overcopy xfs.
+	 */
+	if (size) {
+		if (!(file->f_mode & FINVIS))
+			xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	}
+
+	/*
+	 * If the offset is beyond the size of the file, we have a couple
+	 * of things to do. First, if there is already space allocated
+	 * we need to either create holes or zero the disk or ...
+	 *
+	 * If there is a page where the previous size lands, we need
+	 * to zero it out up to the new size.
+	 */
+
+	if (!direct && (*offset > isize && isize)) {
+		error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
+			isize, *offset + size, NULL);
+		if (error) {
+			xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
+			return(-error);
+		}
+	}
+	xfs_iunlock(xip, XFS_ILOCK_EXCL);
+
+	/*
+	 * If we're writing the file then make sure to clear the
+	 * setuid and setgid bits if the process is not being run
+	 * by root.  This keeps people from modifying setuid and
+	 * setgid binaries.
+	 */
+
+	if (((xip->i_d.di_mode & ISUID) ||
+	    ((xip->i_d.di_mode & (ISGID | (IEXEC >> 3))) ==
+		(ISGID | (IEXEC >> 3)))) &&
+	     !capable(CAP_FSETID)) {
+		error = xfs_write_clear_setuid(xip);
+		if (error) {
+			xfs_iunlock(xip, iolock);
+			return -error;
+		}
+	}
+
+retry:
+	if (direct) {
+		xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1);
+	}
+
+	ret = generic_file_write_nolock(file, buf, size, offset);
+
+	if ((ret == -ENOSPC) &&
+	    DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
+	    !(file->f_mode & FINVIS)) {
+
+		xfs_rwunlock(bdp, locktype);
+		error = dm_send_namesp_event(DM_EVENT_NOSPACE, bdp,
+				DM_RIGHT_NULL, bdp, DM_RIGHT_NULL, NULL, NULL,
+				0, 0, 0); /* Delay flag intentionally  unused */
+		if (error)
+			return -error;
+		xfs_rwlock(bdp, locktype);
+		*offset = xip->i_d.di_size;
+		goto retry;
+
+	}
+
+	if (ret <= 0) {
+		xfs_rwunlock(bdp, locktype);
+		return ret;
+	}
+
+	XFS_STATS_ADD(xfsstats.xs_write_bytes, ret);
+
+	if (*offset > xip->i_d.di_size) {
+		xfs_ilock(xip, XFS_ILOCK_EXCL);
+		if (*offset > xip->i_d.di_size) {
+			struct inode	*inode = LINVFS_GET_IP(vp);
+
+			inode->i_size = xip->i_d.di_size = *offset;
+			xip->i_update_core = 1;
+			xip->i_update_size = 1;
+		}
+		xfs_iunlock(xip, XFS_ILOCK_EXCL);
+	}
+
+	/* Handle various SYNC-type writes */
+	if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {
+
+		/*
+		 * If we're treating this as O_DSYNC and we have not updated the
+		 * size, force the log.
+		 */
+
+		if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
+			&& !(xip->i_update_size)) {
+			/*
+			 * If an allocation transaction occurred
+			 * without extending the size, then we have to force
+			 * the log up the proper point to ensure that the
+			 * allocation is permanent.  We can't count on
+			 * the fact that buffered writes lock out direct I/O
+			 * writes - the direct I/O write could have extended
+			 * the size nontransactionally, then finished before
+			 * we started.	xfs_write_file will think that the file
+			 * didn't grow but the update isn't safe unless the
+			 * size change is logged.
+			 *
+			 * Force the log if we've committed a transaction
+			 * against the inode or if someone else has and
+			 * the commit record hasn't gone to disk (e.g.
+			 * the inode is pinned).  This guarantees that
+			 * all changes affecting the inode are permanent
+			 * when we return.
+			 */
+
+			xfs_inode_log_item_t *iip;
+			xfs_lsn_t lsn;
+
+			iip = xip->i_itemp;
+			if (iip && iip->ili_last_lsn) {
+				lsn = iip->ili_last_lsn;
+				xfs_log_force(mp, lsn,
+						XFS_LOG_FORCE | XFS_LOG_SYNC);
+			} else if (xfs_ipincount(xip) > 0) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+						XFS_LOG_FORCE | XFS_LOG_SYNC);
+			}
+
+		} else {
+			xfs_trans_t	*tp;
+
+			/*
+			 * O_SYNC or O_DSYNC _with_ a size update are handled
+			 * the same way.
+			 *
+			 * If the write was synchronous then we need to make
+			 * sure that the inode modification time is permanent.
+			 * We'll have updated the timestamp above, so here
+			 * we use a synchronous transaction to log the inode.
+			 * It's not fast, but it's necessary.
+			 *
+			 * If this a dsync write and the size got changed
+			 * non-transactionally, then we need to ensure that
+			 * the size change gets logged in a synchronous
+			 * transaction.
+			 */
+
+			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
+			if ((error = xfs_trans_reserve(tp, 0,
+						      XFS_SWRITE_LOG_RES(mp),
+						      0, 0, 0))) {
+				/* Transaction reserve failed */
+				xfs_trans_cancel(tp, 0);
+			} else {
+				/* Transaction reserve successful */
+				xfs_ilock(xip, XFS_ILOCK_EXCL);
+				xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(tp, xip);
+				xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
+				xfs_trans_set_sync(tp);
+				error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0);
+				xfs_iunlock(xip, XFS_ILOCK_EXCL);
+			}
+		}
+	} /* (ioflags & O_SYNC) */
+
+	/*
+	 * If we are coming from an nfsd thread then insert into the
+	 * reference cache.
+	 */
+
+	if (!strcmp(current->comm, "nfsd"))
+		xfs_refcache_insert(xip);
+
+	/* Drop lock this way - the old refcache release is in here */
+	xfs_rwunlock(bdp, locktype);
+
+	return(ret);
+}
+
+/*
+ * xfs_bmap() is the same as the irix xfs_bmap from xfs_rw.c
+ * execpt for slight changes to the params
+ */
+int
+xfs_bmap(bhv_desc_t	*bdp,
+	xfs_off_t	offset,
+	ssize_t		count,
+	int		flags,
+	struct cred	*cred,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps)
+{
+	xfs_inode_t	*ip;
+	int		error;
+	int		lockmode;
+	int		fsynced = 0;
+	vnode_t		*vp;
+
+	ip = XFS_BHVTOI(bdp);
+	ASSERT((ip->i_d.di_mode & IFMT) == IFREG);
+	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
+	       ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
+	ASSERT((flags & PBF_READ) || (flags & PBF_WRITE));
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_iocore.io_mount))
+		return XFS_ERROR(EIO);
+
+	if (flags & PBF_READ) {
+		lockmode = xfs_ilock_map_shared(ip);
+		error = xfs_iomap_read(&ip->i_iocore, offset, count,
+				 XFS_BMAPI_ENTIRE, pbmapp, npbmaps, NULL);
+		xfs_iunlock_map_shared(ip, lockmode);
+	} else { /* PBF_WRITE */
+		ASSERT(flags & PBF_WRITE);
+		vp = BHV_TO_VNODE(bdp);
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		/*
+		 * Make sure that the dquots are there. This doesn't hold
+		 * the ilock across a disk read.
+		 */
+
+		if (XFS_IS_QUOTA_ON(ip->i_mount)) {
+			if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
+				if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_ILOCKED))) {
+					xfs_iunlock(ip, XFS_ILOCK_EXCL);
+					return XFS_ERROR(error);
+				}
+			}
+		}
+retry:
+		error = xfs_iomap_write(&ip->i_iocore, offset, count,
+					pbmapp, npbmaps, flags, NULL);
+		/* xfs_iomap_write unlocks/locks/unlocks */
+
+		if (error == ENOSPC) {
+			switch (fsynced) {
+			case 0:
+				if (ip->i_delayed_blks) {
+					filemap_fdatawrite(LINVFS_GET_IP(vp)->i_mapping);
+					fsynced = 1;
+				} else {
+					fsynced = 2;
+					flags |= PBF_SYNC;
+				}
+				error = 0;
+				xfs_ilock(ip, XFS_ILOCK_EXCL);
+				goto retry;
+			case 1:
+				fsynced = 2;
+				if (!(flags & PBF_SYNC)) {
+					flags |= PBF_SYNC;
+					error = 0;
+					xfs_ilock(ip, XFS_ILOCK_EXCL);
+					goto retry;
+				}
+			case 2:
+				sync_blockdev(vp->v_vfsp->vfs_super->s_bdev);
+				xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+						XFS_LOG_FORCE|XFS_LOG_SYNC);
+
+				error = 0;
+/**
+				delay(HZ);
+**/
+				fsynced++;
+				xfs_ilock(ip, XFS_ILOCK_EXCL);
+				goto retry;
+			}
+		}
+	}
+
+	return XFS_ERROR(error);
+}
+
+int
+xfs_strategy(bhv_desc_t *bdp,
+	xfs_off_t	offset,
+	ssize_t		count,
+	int		flags,
+	struct cred	*cred,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps)
+{
+	xfs_inode_t	*ip;
+	xfs_iocore_t	*io;
+	xfs_mount_t	*mp;
+	int		error;
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	end_fsb;
+	xfs_fileoff_t	map_start_fsb;
+	xfs_fileoff_t	last_block;
+	xfs_fsblock_t	first_block;
+	xfs_bmap_free_t free_list;
+	xfs_filblks_t	count_fsb;
+	int		committed, i, loops, nimaps;
+	int		is_xfs = 1; /* This will be a variable at some point */
+	xfs_bmbt_irec_t imap[XFS_MAX_RW_NBMAPS];
+	xfs_trans_t	*tp;
+
+	ip = XFS_BHVTOI(bdp);
+	io = &ip->i_iocore;
+	mp = ip->i_mount;
+	/* is_xfs = IO_IS_XFS(io); */
+	ASSERT((ip->i_d.di_mode & IFMT) == IFREG);
+	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
+	       ((io->io_flags & XFS_IOCORE_RT) != 0));
+	ASSERT((flags & PBF_READ) || (flags & PBF_WRITE));
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	ASSERT(flags & PBF_WRITE);
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	nimaps = min(XFS_MAX_RW_NBMAPS, *npbmaps);
+	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	first_block = NULLFSBLOCK;
+
+	XFS_ILOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
+	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
+			(xfs_filblks_t)(end_fsb - offset_fsb),
+			XFS_BMAPI_ENTIRE, &first_block, 0, imap,
+			&nimaps, NULL);
+	XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
+	if (error) {
+		return XFS_ERROR(error);
+	}
+
+	if (nimaps && !ISNULLSTARTBLOCK(imap[0].br_startblock)) {
+		*npbmaps = _xfs_imap_to_bmap(&ip->i_iocore, offset, imap,
+				pbmapp, nimaps, *npbmaps);
+		return 0;
+	}
+
+	/*
+	 * Make sure that the dquots are there.
+	 */
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((error = xfs_qm_dqattach(ip, 0))) {
+				return XFS_ERROR(error);
+			}
+		}
+	}
+	XFS_STATS_ADD(xfsstats.xs_xstrat_bytes,
+		XFS_FSB_TO_B(mp, imap[0].br_blockcount));
+
+	offset_fsb = imap[0].br_startoff;
+	count_fsb = imap[0].br_blockcount;
+	map_start_fsb = offset_fsb;
+	while (count_fsb != 0) {
+		/*
+		 * Set up a transaction with which to allocate the
+		 * backing store for the file.	Do allocations in a
+		 * loop until we get some space in the range we are
+		 * interested in.  The other space that might be allocated
+		 * is in the delayed allocation extent on which we sit
+		 * but before our buffer starts.
+		 */
+		nimaps = 0;
+		loops = 0;
+		while (nimaps == 0) {
+			if (is_xfs) {
+				tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
+				error = xfs_trans_reserve(tp, 0,
+						XFS_WRITE_LOG_RES(mp),
+						0, XFS_TRANS_PERM_LOG_RES,
+						XFS_WRITE_LOG_COUNT);
+				if (error) {
+					xfs_trans_cancel(tp, 0);
+					goto error0;
+				}
+				xfs_ilock(ip, XFS_ILOCK_EXCL);
+				xfs_trans_ijoin(tp, ip,
+						XFS_ILOCK_EXCL);
+				xfs_trans_ihold(tp, ip);
+			} else {
+				tp = NULL;
+				XFS_ILOCK(mp, io, XFS_ILOCK_EXCL |
+						XFS_EXTSIZE_WR);
+			}
+
+
+			/*
+			 * Allocate the backing store for the file.
+			 */
+			XFS_BMAP_INIT(&(free_list),
+					&(first_block));
+			nimaps = XFS_STRAT_WRITE_IMAPS;
+
+			/*
+			 * Ensure we don't go beyond eof - it is possible
+			 * the extents changed since we did the read call,
+			 * we dropped the ilock in the interim.
+			 */
+
+			end_fsb = XFS_B_TO_FSB(mp, XFS_SIZE(mp, io));
+			xfs_bmap_last_offset(NULL, ip, &last_block,
+				XFS_DATA_FORK);
+			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
+			if ((map_start_fsb + count_fsb) > last_block) {
+				count_fsb = last_block - map_start_fsb;
+				if (count_fsb == 0) {
+					if (is_xfs) {
+						xfs_bmap_cancel(&free_list);
+						xfs_trans_cancel(tp,
+						  (XFS_TRANS_RELEASE_LOG_RES |
+							 XFS_TRANS_ABORT));
+					}
+					XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL |
+							    XFS_EXTSIZE_WR);
+					return XFS_ERROR(EAGAIN);
+				}
+			}
+
+			error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb,
+					XFS_BMAPI_WRITE, &first_block, 1,
+					imap, &nimaps, &free_list);
+			if (error) {
+				xfs_bmap_cancel(&free_list);
+				xfs_trans_cancel(tp,
+					(XFS_TRANS_RELEASE_LOG_RES |
+					 XFS_TRANS_ABORT));
+				XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL |
+						    XFS_EXTSIZE_WR);
+
+				goto error0;
+			}
+
+			if (is_xfs) {
+				error = xfs_bmap_finish(&(tp), &(free_list),
+						first_block, &committed);
+				if (error) {
+					xfs_bmap_cancel(&free_list);
+					xfs_trans_cancel(tp,
+						(XFS_TRANS_RELEASE_LOG_RES |
+						XFS_TRANS_ABORT));
+					xfs_iunlock(ip, XFS_ILOCK_EXCL);
+					goto error0;
+				}
+
+				error = xfs_trans_commit(tp,
+						XFS_TRANS_RELEASE_LOG_RES,
+						NULL);
+				if (error) {
+					xfs_iunlock(ip, XFS_ILOCK_EXCL);
+					goto error0;
+				}
+			}
+
+			if (nimaps == 0) {
+				XFS_IUNLOCK(mp, io,
+						XFS_ILOCK_EXCL|XFS_EXTSIZE_WR);
+			} /* else hold 'till we maybe loop again below */
+		}
+
+		/*
+		 * See if we were able to allocate an extent that
+		 * covers at least part of the user's requested size.
+		 */
+
+		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+		for(i = 0; i < nimaps; i++) {
+			int maps;
+			if (offset_fsb >= imap[i].br_startoff &&
+				(offset_fsb < (imap[i].br_startoff + imap[i].br_blockcount))) {
+				XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR);
+				maps = min(nimaps, *npbmaps);
+				*npbmaps = _xfs_imap_to_bmap(io, offset, &imap[i],
+					pbmapp, maps, *npbmaps);
+				XFS_STATS_INC(xfsstats.xs_xstrat_quick);
+				return 0;
+			}
+			count_fsb -= imap[i].br_blockcount; /* for next bmapi,
+								if needed. */
+		}
+
+		/*
+		 * We didn't get an extent the caller can write into so
+		 * loop around and try starting after the last imap we got back.
+		 */
+
+		nimaps--; /* Index of last entry  */
+		ASSERT(nimaps >= 0);
+		ASSERT(offset_fsb >= imap[nimaps].br_startoff + imap[nimaps].br_blockcount);
+		ASSERT(count_fsb);
+		offset_fsb = imap[nimaps].br_startoff + imap[nimaps].br_blockcount;
+		map_start_fsb = offset_fsb;
+		XFS_STATS_INC(xfsstats.xs_xstrat_split);
+		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_WR);
+	}
+
+	ASSERT(0);	/* Should never get here */
+
+ error0:
+	if (error) {
+		ASSERT(count_fsb != 0);
+		ASSERT(is_xfs || XFS_FORCED_SHUTDOWN(mp));
+	}
+
+	return XFS_ERROR(error);
+}
+
+
+STATIC int
+_xfs_imap_to_bmap(
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,
+	xfs_bmbt_irec_t *imap,
+	pb_bmap_t	*pbmapp,
+	int		imaps,			/* Number of imap entries */
+	int		pbmaps)			/* Number of pbmap entries */
+{
+	xfs_mount_t	*mp;
+	xfs_fsize_t	nisize;
+	int		im, pbm;
+	xfs_fsblock_t	start_block;
+
+	mp = io->io_mount;
+	nisize = XFS_SIZE(mp, io);
+	if (io->io_new_size > nisize)
+		nisize = io->io_new_size;
+
+	for (im=0, pbm=0; im < imaps && pbm < pbmaps; im++,pbmapp++,imap++,pbm++) {
+		pbmapp->pbm_target = io->io_flags & XFS_IOCORE_RT ?
+			mp->m_rtdev_targp :
+			mp->m_ddev_targp;
+		pbmapp->pbm_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+		pbmapp->pbm_delta = offset - pbmapp->pbm_offset;
+		pbmapp->pbm_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
+		pbmapp->pbm_flags = 0;
+
+		start_block = imap->br_startblock;
+		if (start_block == HOLESTARTBLOCK) {
+			pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
+			pbmapp->pbm_flags = PBMF_HOLE;
+		} else if (start_block == DELAYSTARTBLOCK) {
+			pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
+			pbmapp->pbm_flags = PBMF_DELAY;
+		} else {
+			pbmapp->pbm_bn = XFS_FSB_TO_DB_IO(io, start_block);
+			if (imap->br_state == XFS_EXT_UNWRITTEN)
+				pbmapp->pbm_flags |= PBMF_UNWRITTEN;
+		}
+
+		if ((pbmapp->pbm_offset + pbmapp->pbm_bsize) >= nisize) {
+			pbmapp->pbm_flags |= PBMF_EOF;
+		}
+
+		offset += pbmapp->pbm_bsize - pbmapp->pbm_delta;
+	}
+	return(pbm);	/* Return the number filled */
+}
+
+STATIC int
+xfs_iomap_read(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	int		flags,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	struct pm	*pmp)
+{
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	end_fsb;
+	int		nimaps;
+	int		error;
+	xfs_mount_t	*mp;
+	xfs_bmbt_irec_t imap[XFS_MAX_RW_NBMAPS];
+
+	ASSERT(ismrlocked(io->io_lock, MR_UPDATE | MR_ACCESS) != 0);
+/**	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE | MR_ACCESS) != 0); **/
+/*	xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count); */
+
+	mp = io->io_mount;
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	nimaps = sizeof(imap) / sizeof(imap[0]);
+	nimaps = min(nimaps, *npbmaps); /* Don't ask for more than caller has */
+	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
+				(xfs_filblks_t)(end_fsb - offset_fsb),
+				flags, NULL, 0, imap,
+				&nimaps, NULL);
+	if (error) {
+		return XFS_ERROR(error);
+	}
+
+	if(nimaps) {
+		*npbmaps = _xfs_imap_to_bmap(io, offset, imap, pbmapp, nimaps,
+			*npbmaps);
+	} else
+		*npbmaps = 0;
+	return XFS_ERROR(error);
+}
+
+/*
+ * xfs_iomap_write: return pagebuf_bmap_t's telling higher layers
+ *	where to write.
+ * There are 2 main cases:
+ *	1 the extents already exist
+ *	2 must allocate.
+ *	There are 3 cases when we allocate:
+ *		delay allocation (doesn't really allocate or use transactions)
+ *		direct allocation (no previous delay allocation
+ *		convert delay to real allocations
+ */
+
+STATIC int
+xfs_iomap_write(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	int		ioflag,
+	struct pm	*pmp)
+{
+	int		maps;
+	int		error = 0;
+	int		found;
+	int		flags = 0;
+
+	maps = *npbmaps;
+	if (!maps)
+		goto out;
+
+	/*
+	 * If we have extents that are allocated for this range,
+	 * return them.
+	 */
+
+	found = 0;
+	error = xfs_iomap_read(io, offset, count, flags, pbmapp, npbmaps, NULL);
+	if (error)
+		goto out;
+
+	/*
+	 * If we found mappings and they can just have data written
+	 * without conversion,
+	 * let the caller write these and call us again.
+	 *
+	 * If we have a HOLE or UNWRITTEN, proceed down lower to
+	 * get the space or to convert to written.
+	 */
+
+	if (*npbmaps) {
+		if (!(pbmapp->pbm_flags & PBMF_HOLE)) {
+			*npbmaps = 1; /* Only checked the first one. */
+					/* We could check more, ... */
+			goto out;
+		}
+	}
+	found = *npbmaps;
+	*npbmaps = maps; /* Restore to original requested */
+
+	if (ioflag & PBF_DIRECT) {
+		error = xfs_iomap_write_direct(io, offset, count, pbmapp,
+					npbmaps, ioflag, found);
+	} else {
+		error = xfs_iomap_write_delay(io, offset, count, pbmapp,
+				npbmaps, ioflag, found);
+	}
+
+out:
+	XFS_IUNLOCK(io->io_mount, io, XFS_ILOCK_EXCL);
+	return XFS_ERROR(error);
+}
+
+/*
+ * Map the given I/O size and I/O alignment over the given extent.
+ * If we're at the end of the file and the underlying extent is
+ * delayed alloc, make sure we extend out to the
+ * next i_writeio_blocks boundary.  Otherwise make sure that we
+ * are confined to the given extent.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_write_bmap(
+	xfs_mount_t	*mp,
+	xfs_iocore_t	*io,
+	xfs_bmbt_irec_t *imapp,
+	pb_bmap_t	*pbmapp,
+	int		iosize,
+	xfs_fileoff_t	ioalign,
+	xfs_fsize_t	isize)
+{
+	__int64_t	extra_blocks;
+	xfs_fileoff_t	size_diff;
+	xfs_fileoff_t	ext_offset;
+	xfs_fsblock_t	start_block;
+	int		length;		/* length of this mapping in blocks */
+	xfs_off_t	offset;		/* logical block offset of this mapping */
+
+	if (ioalign < imapp->br_startoff) {
+		/*
+		 * The desired alignment doesn't end up on this
+		 * extent.  Move up to the beginning of the extent.
+		 * Subtract whatever we drop from the iosize so that
+		 * we stay aligned on iosize boundaries.
+		 */
+		size_diff = imapp->br_startoff - ioalign;
+		iosize -= (int)size_diff;
+		ASSERT(iosize > 0);
+		ext_offset = 0;
+		offset = imapp->br_startoff;
+		pbmapp->pbm_offset = XFS_FSB_TO_B(mp, imapp->br_startoff);
+	} else {
+		/*
+		 * The alignment requested fits on this extent,
+		 * so use it.
+		 */
+		ext_offset = ioalign - imapp->br_startoff;
+		offset = ioalign;
+		pbmapp->pbm_offset = XFS_FSB_TO_B(mp, ioalign);
+	}
+	start_block = imapp->br_startblock;
+	ASSERT(start_block != HOLESTARTBLOCK);
+	if (start_block != DELAYSTARTBLOCK) {
+		pbmapp->pbm_bn = XFS_FSB_TO_DB_IO(io, start_block + ext_offset);
+		if (imapp->br_state == XFS_EXT_UNWRITTEN) {
+			pbmapp->pbm_flags = PBMF_UNWRITTEN;
+		}
+	} else {
+		pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
+		pbmapp->pbm_flags = PBMF_DELAY;
+	}
+	pbmapp->pbm_target = io->io_flags & XFS_IOCORE_RT ?
+		mp->m_rtdev_targp :
+		mp->m_ddev_targp;
+	length = iosize;
+
+	/*
+	 * If the iosize from our offset extends beyond the end of
+	 * the extent, then trim down length to match that of the extent.
+	 */
+	extra_blocks = (xfs_off_t)(offset + length) -
+		       (__uint64_t)(imapp->br_startoff +
+				    imapp->br_blockcount);
+	if (extra_blocks > 0) {
+		length -= extra_blocks;
+		ASSERT(length > 0);
+	}
+
+	pbmapp->pbm_bsize = XFS_FSB_TO_B(mp, length);
+}
+
+STATIC int
+xfs_iomap_write_delay(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	int		ioflag,
+	int		found)
+{
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	ioalign;
+	xfs_fileoff_t	last_fsb;
+	xfs_fileoff_t	start_fsb;
+	xfs_filblks_t	count_fsb;
+	xfs_off_t	aligned_offset;
+	xfs_fsize_t	isize;
+	xfs_fsblock_t	firstblock;
+	__uint64_t	last_page_offset;
+	int		nimaps;
+	int		error;
+	int		n;
+	unsigned int	iosize;
+	short		small_write;
+	xfs_mount_t	*mp;
+#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
+	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
+	int		aeof;
+
+	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
+
+/*	xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count); */
+
+	mp = io->io_mount;
+/***
+	ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
+***/
+
+	isize = XFS_SIZE(mp, io);
+	if (io->io_new_size > isize) {
+		isize = io->io_new_size;
+	}
+
+	aeof = 0;
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	/*
+	 * If the caller is doing a write at the end of the file,
+	 * then extend the allocation (and the buffer used for the write)
+	 * out to the file system's write iosize.  We clean up any extra
+	 * space left over when the file is closed in xfs_inactive().
+	 * We can only do this if we are sure that we will create buffers
+	 * over all of the space we allocate beyond the end of the file.
+	 * Not doing so would allow us to create delalloc blocks with
+	 * no pages in memory covering them.  So, we need to check that
+	 * there are not any real blocks in the area beyond the end of
+	 * the file which we are optimistically going to preallocate. If
+	 * there are then our buffers will stop when they encounter them
+	 * and we may accidentally create delalloc blocks beyond them
+	 * that we never cover with a buffer.  All of this is because
+	 * we are not actually going to write the extra blocks preallocated
+	 * at this point.
+	 *
+	 * We don't bother with this for sync writes, because we need
+	 * to minimize the amount we write for good performance.
+	 */
+	if (!(ioflag & PBF_SYNC) && ((offset + count) > XFS_SIZE(mp, io))) {
+		start_fsb = XFS_B_TO_FSBT(mp,
+				  ((xfs_ufsize_t)(offset + count - 1)));
+		count_fsb = mp->m_writeio_blocks;
+		while (count_fsb > 0) {
+			nimaps = XFS_WRITE_IMAPS;
+			error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
+					  0, NULL, 0, imap, &nimaps,
+					  NULL);
+			if (error) {
+				return error;
+			}
+			for (n = 0; n < nimaps; n++) {
+				if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
+				    (imap[n].br_startblock != DELAYSTARTBLOCK)) {
+					goto write_map;
+				}
+				start_fsb += imap[n].br_blockcount;
+				count_fsb -= imap[n].br_blockcount;
+				ASSERT(count_fsb < 0xffff000);
+			}
+		}
+		iosize = mp->m_writeio_blocks;
+		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
+		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+		last_fsb = ioalign + iosize;
+		aeof = 1;
+	}
+ write_map:
+	nimaps = XFS_WRITE_IMAPS;
+	firstblock = NULLFSBLOCK;
+
+	/*
+	 * roundup the allocation request to m_dalign boundary if file size
+	 * is greater that 512K and we are allocating past the allocation eof
+	 */
+	if (mp->m_dalign && (XFS_SIZE(mp, io) >= mp->m_dalign) && aeof) {
+		int eof;
+		xfs_fileoff_t new_last_fsb;
+		new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
+		error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
+		if (error) {
+			return error;
+		}
+		if (eof) {
+			last_fsb = new_last_fsb;
+		}
+	}
+
+	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
+			  (xfs_filblks_t)(last_fsb - offset_fsb),
+			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
+			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
+			  &nimaps, NULL);
+	/*
+	 * This can be EDQUOT, if nimaps == 0
+	 */
+	if (error) {
+		return XFS_ERROR(error);
+	}
+	/*
+	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
+	 * then we must have run out of space.
+	 */
+	if (nimaps == 0) {
+/*		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
+				      io, offset, count); */
+		return XFS_ERROR(ENOSPC);
+	}
+
+	if (!(ioflag & PBF_SYNC) ||
+	    ((last_fsb - offset_fsb) >= mp->m_writeio_blocks)) {
+		/*
+		 * For normal or large sync writes, align everything
+		 * into i_writeio_blocks sized chunks.
+		 */
+		iosize = mp->m_writeio_blocks;
+		aligned_offset = XFS_WRITEIO_ALIGN(mp, offset);
+		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+		small_write = 0;
+		/* XXX - Are we shrinking? XXXXX  */
+	} else {
+		/*
+		 * For small sync writes try to minimize the amount
+		 * of I/O we do.  Round down and up to the larger of
+		 * page or block boundaries.  Set the small_write
+		 * variable to 1 to indicate to the code below that
+		 * we are not using the normal buffer alignment scheme.
+		 */
+		if (NBPP > mp->m_sb.sb_blocksize) {
+			aligned_offset = ctooff(offtoct(offset));
+			ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
+			last_page_offset = ctob64(btoc64(offset + count));
+			iosize = XFS_B_TO_FSBT(mp, last_page_offset -
+					       aligned_offset);
+		} else {
+			ioalign = offset_fsb;
+			iosize = last_fsb - offset_fsb;
+		}
+		small_write = 1;
+		/* XXX - Are we shrinking? XXXXX  */
+	}
+
+	/*
+	 * Now map our desired I/O size and alignment over the
+	 * extents returned by xfs_bmapi().
+	 */
+	xfs_write_bmap(mp, io, imap, pbmapp, iosize, ioalign, isize);
+	pbmapp->pbm_delta = offset - pbmapp->pbm_offset;
+
+	ASSERT((pbmapp->pbm_bsize > 0)
+		&& (pbmapp->pbm_bsize - pbmapp->pbm_delta > 0));
+
+	/*
+	 * A bmap is the EOF bmap when it reaches to or beyond the new
+	 * inode size.
+	 */
+	if ((pbmapp->pbm_offset + pbmapp->pbm_bsize ) >= isize) {
+		pbmapp->pbm_flags |= PBMF_EOF;
+	}
+
+/*	xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP,
+			    io, offset, count, bmapp, imap);	     */
+
+	/* On IRIX, we walk more imaps filling in more bmaps. On Linux
+		just handle one for now. To find the code on IRIX,
+		look in xfs_iomap_write() in xfs_rw.c. */
+
+	*npbmaps = 1;
+	return 0;
+}
+
+STATIC int
+xfs_iomap_write_direct(
+	xfs_iocore_t	*io,
+	loff_t		offset,
+	size_t		count,
+	pb_bmap_t	*pbmapp,
+	int		*npbmaps,
+	int		ioflag,
+	int		found)
+{
+	xfs_inode_t	*ip = XFS_IO_INODE(io);
+	xfs_mount_t	*mp;
+	xfs_fileoff_t	offset_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_filblks_t	count_fsb;
+	xfs_fsize_t	isize;
+	xfs_fsblock_t	firstfsb;
+	int		nimaps, maps;
+	int		error;
+	xfs_trans_t	*tp;
+
+#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
+	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
+	xfs_bmap_free_t free_list;
+	int		aeof;
+	int		bmapi_flags;
+	xfs_filblks_t	datablocks;
+	int		rt;
+	int		committed;
+	int		numrtextents;
+	uint		resblks;
+	int		rtextsize;
+
+	maps = min(XFS_WRITE_IMAPS, *npbmaps);
+	nimaps = maps;
+
+	mp = io->io_mount;
+	isize = XFS_SIZE(mp, io);
+	if (io->io_new_size > isize)
+		isize = io->io_new_size;
+
+	if ((offset + count) > isize) {
+		aeof = 1;
+	} else {
+		aeof = 0;
+	}
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
+	count_fsb = last_fsb - offset_fsb;
+	if (found && (pbmapp->pbm_flags & PBMF_HOLE)) {
+		xfs_fileoff_t	map_last_fsb;
+		map_last_fsb = XFS_B_TO_FSB(mp,
+			(pbmapp->pbm_bsize + pbmapp->pbm_offset));
+
+		if (map_last_fsb < last_fsb) {
+			last_fsb = map_last_fsb;
+			count_fsb = last_fsb - offset_fsb;
+		}
+		ASSERT(count_fsb > 0);
+	}
+
+	/*
+	 * roundup the allocation request to m_dalign boundary if file size
+	 * is greater that 512K and we are allocating past the allocation eof
+	 */
+	if (!found && mp->m_dalign && (isize >= 524288) && aeof) {
+		int eof;
+		xfs_fileoff_t new_last_fsb;
+		new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
+		printk("xfs_iomap_write_direct: about to XFS_BMAP_EOF %Ld\n",
+			new_last_fsb);
+		error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
+		if (error) {
+			goto error_out;
+		}
+		if (eof)
+			last_fsb = new_last_fsb;
+	}
+
+	bmapi_flags = XFS_BMAPI_WRITE|XFS_BMAPI_DIRECT_IO|XFS_BMAPI_ENTIRE;
+	bmapi_flags &= ~XFS_BMAPI_DIRECT_IO;
+
+	/*
+	 * determine if this is a realtime file
+	 */
+	if ((rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) != 0) {
+		rtextsize = mp->m_sb.sb_rextsize;
+	} else
+		rtextsize = 0;
+
+	error = 0;
+
+	/*
+	 * allocate file space for the bmapp entries passed in.
+	 */
+
+	/*
+	 * determine if reserving space on
+	 * the data or realtime partition.
+	 */
+	if (rt) {
+		numrtextents = (count_fsb + rtextsize - 1);
+		do_div(numrtextents, rtextsize);
+		datablocks = 0;
+	} else {
+		datablocks = count_fsb;
+		numrtextents = 0;
+	}
+
+	/*
+	 * allocate and setup the transaction
+	 */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	error = xfs_trans_reserve(tp,
+				  resblks,
+				  XFS_WRITE_LOG_RES(mp),
+				  numrtextents,
+				  XFS_TRANS_PERM_LOG_RES,
+				  XFS_WRITE_LOG_COUNT);
+
+	/*
+	 * check for running out of space
+	 */
+	if (error) {
+		/*
+		 * Free the transaction structure.
+		 */
+		xfs_trans_cancel(tp, 0);
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	if (error)  {
+		goto error_out; /* Don't return in above if .. trans ..,
+					need lock to return */
+	}
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (xfs_trans_reserve_quota(tp,
+					    ip->i_udquot,
+					    ip->i_gdquot,
+					    resblks, 0, 0)) {
+			error = (EDQUOT);
+			goto error1;
+		}
+		nimaps = 1;
+	} else {
+		nimaps = 2;
+	}
+
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+
+	/*
+	 * issue the bmapi() call to allocate the blocks
+	 */
+	XFS_BMAP_INIT(&free_list, &firstfsb);
+	imapp = &imap[0];
+	error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb,
+		bmapi_flags, &firstfsb, 1, imapp, &nimaps, &free_list);
+	if (error) {
+		goto error0;
+	}
+
+	/*
+	 * complete the transaction
+	 */
+
+	error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+	if (error) {
+		goto error0;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (error) {
+		goto error_out;
+	}
+
+	/* copy any maps to caller's array and return any error. */
+	if (nimaps == 0) {
+		error = (ENOSPC);
+		goto error_out;
+	}
+
+	maps = min(nimaps, maps);
+	*npbmaps = _xfs_imap_to_bmap(io, offset, &imap[0], pbmapp, maps,
+								*npbmaps);
+	if(*npbmaps) {
+		/*
+		 * this is new since xfs_iomap_read
+		 * didn't find it.
+		 */
+		if (*npbmaps != 1) {
+			printk("NEED MORE WORK FOR MULTIPLE BMAPS (which are new)\n");
+		}
+	}
+	goto out;
+
+ error0:	/* Cancel bmap, unlock inode, and cancel trans */
+	xfs_bmap_cancel(&free_list);
+
+ error1:	/* Just cancel transaction */
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	*npbmaps = 0;	/* nothing set-up here */
+
+error_out:
+out:	/* Just return error and any tracing at end of routine */
+	return XFS_ERROR(error);
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(struct xfs_buf *bp)
+{
+	xfs_mount_t	*mp;
+
+	mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
+	if (!XFS_FORCED_SHUTDOWN(mp)) {
+		pagebuf_iorequest(bp);
+		return 0;
+	} else {
+		xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
+		/*
+		 * Metadata write that didn't get logged but
+		 * written delayed anyway. These aren't associated
+		 * with a transaction, and can be ignored.
+		 */
+		if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
+		    (XFS_BUF_ISREAD(bp)) == 0)
+			return (xfs_bioerror_relse(bp));
+		else
+			return (xfs_bioerror(bp));
+	}
+}
+/*
+ * Wrapper around bdstrat so that we can stop data
+ * from going to disk in case we are shutting down the filesystem.
+ * Typically user data goes thru this path; one of the exceptions
+ * is the superblock.
+ */
+int
+xfsbdstrat(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	ASSERT(mp);
+	if (!XFS_FORCED_SHUTDOWN(mp)) {
+		/* Grio redirection would go here
+		 * if (XFS_BUF_IS_GRIO(bp)) {
+		 */
+
+		pagebuf_iorequest(bp);
+		return 0;
+	}
+
+	xfs_buftrace("XFSBDSTRAT IOERROR", bp);
+	return (xfs_bioerror_relse(bp));
+}
+
+
+void
+XFS_bflush(xfs_buftarg_t *target)
+{
+	pagebuf_delwri_flush(target, PBDF_WAIT, NULL);
+}
+
+
+/* Push all fs state out to disk
+ */
+
+void
+XFS_log_write_unmount_ro(bhv_desc_t	*bdp)
+{
+	xfs_mount_t	*mp;
+	int pincount = 0;
+	int count = 0;
+	int error;
+
+	mp = XFS_BHVTOM(bdp);
+	xfs_refcache_purge_mp(mp);
+	xfs_binval(mp->m_ddev_targp);
+
+	do {
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+		VFS_SYNC(XFS_MTOVFS(mp), SYNC_ATTR|SYNC_WAIT, NULL, error);
+		pagebuf_delwri_flush(mp->m_ddev_targp,
+				PBDF_WAIT, &pincount);
+		if (pincount == 0) {delay(50); count++;}
+	}  while (count < 2);
+
+	/* Ok now write out an unmount record */
+	xfs_log_unmount_write(mp);
+	xfs_unmountfs_writesb(mp);
+}
+
+/*
+ * In these two situations we disregard the readonly mount flag and
+ * temporarily enable writes (we must, to ensure metadata integrity).
+ */
+STATIC int
+xfs_is_read_only(xfs_mount_t *mp)
+{
+	if (bdev_read_only(mp->m_ddev_targp->pbr_bdev) ||
+	    bdev_read_only(mp->m_logdev_targp->pbr_bdev)) {
+		cmn_err(CE_NOTE,
+			"XFS: write access unavailable, cannot proceed.");
+		return EROFS;
+	}
+	cmn_err(CE_NOTE,
+		"XFS: write access will be enabled during mount.");
+	XFS_MTOVFS(mp)->vfs_flag &= ~VFS_RDONLY;
+	return 0;
+}
+
+int
+xfs_recover_read_only(xlog_t *log)
+{
+	cmn_err(CE_NOTE, "XFS: WARNING: "
+		"recovery required on readonly filesystem.");
+	return xfs_is_read_only(log->l_mp);
+}
+
+int
+xfs_quotacheck_read_only(xfs_mount_t *mp)
+{
+	cmn_err(CE_NOTE, "XFS: WARNING: "
+		"quotacheck required on readonly filesystem.");
+	return xfs_is_read_only(mp);
+}
diff --git a/fs/xfs/linux/xfs_lrw.h b/fs/xfs/linux/xfs_lrw.h
new file mode 100644
index 000000000000..12bdf990ac16
--- /dev/null
+++ b/fs/xfs/linux/xfs_lrw.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __XFS_LRW_H__
+#define __XFS_LRW_H__
+
+#define XFS_IOMAP_READ_ENTER	3
+/*
+ * Maximum count of bmaps used by read and write paths.
+ */
+#define XFS_MAX_RW_NBMAPS	4
+
+extern int xfs_bmap (bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, pb_bmap_t *, int *);
+extern int xfs_strategy (bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, pb_bmap_t *, int *);
+extern int xfsbdstrat (struct xfs_mount *, struct xfs_buf *);
+extern int xfs_bdstrat_cb (struct xfs_buf *);
+
+extern int xfs_zero_eof (vnode_t *, struct xfs_iocore *, xfs_off_t,
+				xfs_fsize_t, xfs_fsize_t, struct pm *);
+extern ssize_t xfs_read (
+	struct bhv_desc	       *bdp,
+	struct file		*file,
+	char			*buf,
+	size_t			size,
+	loff_t			*offset,
+	struct cred	       *credp);
+
+extern ssize_t xfs_write (
+	struct bhv_desc		*bdp,
+	struct file		*file,
+	const char		*buf,
+	size_t			size,
+	loff_t			*offset,
+	struct cred		*credp);
+
+extern int xfs_recover_read_only (xlog_t *);
+extern int xfs_quotacheck_read_only (xfs_mount_t *);
+
+extern void XFS_log_write_unmount_ro (bhv_desc_t *);
+
+#define XFS_FSB_TO_DB_IO(io,fsb) \
+		(((io)->io_flags & XFS_IOCORE_RT) ? \
+		 XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \
+		 XFS_FSB_TO_DADDR((io)->io_mount, (fsb)))
+
+#endif	/* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux/xfs_stats.c b/fs/xfs/linux/xfs_stats.c
new file mode 100644
index 000000000000..ef67268eb8e8
--- /dev/null
+++ b/fs/xfs/linux/xfs_stats.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/proc_fs.h>
+
+static int
+xfs_read_xfsstats(char *buffer, char **start, off_t offset,
+			int count, int *eof, void *data)
+{
+	int	i, j, len;
+	static struct xstats_entry {
+		char	*desc;
+		int	endpoint;
+	} xstats[] = {
+		{ "extent_alloc",	XFSSTAT_END_EXTENT_ALLOC	},
+		{ "abt",		XFSSTAT_END_ALLOC_BTREE		},
+		{ "blk_map",		XFSSTAT_END_BLOCK_MAPPING	},
+		{ "bmbt",		XFSSTAT_END_BLOCK_MAP_BTREE	},
+		{ "dir",		XFSSTAT_END_DIRECTORY_OPS	},
+		{ "trans",		XFSSTAT_END_TRANSACTIONS	},
+		{ "ig",			XFSSTAT_END_INODE_OPS		},
+		{ "log",		XFSSTAT_END_LOG_OPS		},
+		{ "push_ail",		XFSSTAT_END_TAIL_PUSHING	},
+		{ "xstrat",		XFSSTAT_END_WRITE_CONVERT	},
+		{ "rw",			XFSSTAT_END_READ_WRITE_OPS	},
+		{ "attr",		XFSSTAT_END_ATTRIBUTE_OPS	},
+		{ "qm",			XFSSTAT_END_QUOTA_OPS		},
+		{ "icluster",		XFSSTAT_END_INODE_CLUSTER	},
+		{ "vnodes",		XFSSTAT_END_VNODE_OPS		},
+	};
+
+	for (i=j=len = 0; i < sizeof(xstats)/sizeof(struct xstats_entry); i++) {
+		len += sprintf(buffer + len, xstats[i].desc);
+		/* inner loop does each group */
+		while (j < xstats[i].endpoint) {
+			len += sprintf(buffer + len, " %u",
+					*(((__u32*)&xfsstats) + j));
+			j++;
+		}
+		buffer[len++] = '\n';
+	}
+	/* extra precision counters */
+	len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n",
+			xfsstats.xs_xstrat_bytes,
+			xfsstats.xs_write_bytes,
+			xfsstats.xs_read_bytes);
+
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+
+static int
+xfs_read_xfsquota(char *buffer, char **start, off_t offset,
+			int count, int *eof, void *data)
+{
+	int	len;
+
+	/* maximum; incore; ratio free to inuse; freelist */
+	len = sprintf(buffer, "%d\t%d\t%d\t%u\n",
+			ndquot,
+			xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
+			xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
+			xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
+
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+
+void
+xfs_init_procfs(void)
+{
+	if (!proc_mkdir("fs/xfs", 0))
+		return;
+	create_proc_read_entry("fs/xfs/stat", 0, 0, xfs_read_xfsstats, NULL);
+	create_proc_read_entry("fs/xfs/xqm", 0, 0, xfs_read_xfsquota, NULL);
+}
+
+void
+xfs_cleanup_procfs(void)
+{
+	remove_proc_entry("fs/xfs/stat", NULL);
+	remove_proc_entry("fs/xfs/xqm", NULL);
+	remove_proc_entry("fs/xfs", NULL);
+}
diff --git a/fs/xfs/linux/xfs_stats.h b/fs/xfs/linux/xfs_stats.h
new file mode 100644
index 000000000000..50f64106695e
--- /dev/null
+++ b/fs/xfs/linux/xfs_stats.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_STATS_H__
+#define __XFS_STATS_H__
+
+/*
+ * procfs interface
+ */
+#ifdef CONFIG_PROC_FS
+extern void xfs_init_procfs(void);
+extern void xfs_cleanup_procfs(void);
+#else
+static __inline void xfs_init_procfs(void) { };
+static __inline void xfs_cleanup_procfs(void) { };
+#endif
+
+#endif /* __XFS_STATS_H__ */
diff --git a/fs/xfs/linux/xfs_super.c b/fs/xfs/linux/xfs_super.c
new file mode 100644
index 000000000000..a23227d6dbbd
--- /dev/null
+++ b/fs/xfs/linux/xfs_super.c
@@ -0,0 +1,888 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/namei.h>
+#include <linux/init.h>
+#include <linux/ctype.h>
+#include <linux/seq_file.h>
+#include "xfs_version.h"
+
+/* xfs_vfs[ops].c */
+extern int  xfs_init(void);
+extern void xfs_cleanup(void);
+
+/* For kernels which have the s_maxbytes field - set it */
+#ifdef MAX_NON_LFS
+# define set_max_bytes(sb)	((sb)->s_maxbytes = XFS_MAX_FILE_OFFSET)
+#else
+# define set_max_bytes(sb)	do { } while (0)
+#endif
+
+#ifdef CONFIG_FS_POSIX_ACL
+# define set_posix_acl(sb)	((sb)->s_flags |= MS_POSIXACL)
+#else
+# define set_posix_acl(sb)	do { } while (0)
+#endif
+
+#ifdef CONFIG_XFS_QUOTA
+static struct quotactl_ops linvfs_qops = {
+	.get_xstate		= linvfs_getxstate,
+	.set_xstate		= linvfs_setxstate,
+	.get_xquota		= linvfs_getxquota,
+	.set_xquota		= linvfs_setxquota,
+};
+# define set_quota_ops(sb)	((sb)->s_qcop = &linvfs_qops)
+#else
+# define set_quota_ops(sb)	do { } while (0)
+#endif
+
+#ifdef CONFIG_XFS_DMAPI
+int dmapi_init(void);
+void dmapi_uninit(void);
+#else
+#define dmapi_init()
+#define dmapi_uninit()
+#endif
+
+static struct super_operations linvfs_sops;
+static struct export_operations linvfs_export_ops;
+
+#define MNTOPT_LOGBUFS	"logbufs"	/* number of XFS log buffers */
+#define MNTOPT_LOGBSIZE "logbsize"	/* size of XFS log buffers */
+#define MNTOPT_LOGDEV	"logdev"	/* log device */
+#define MNTOPT_RTDEV	"rtdev"		/* realtime I/O device */
+#define MNTOPT_DMAPI	"dmapi"		/* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_XDSM	"xdsm"		/* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_BIOSIZE	"biosize"	/* log2 of preferred buffered io size */
+#define MNTOPT_WSYNC	"wsync"		/* safe-mode nfs compatible mount */
+#define MNTOPT_INO64	"ino64"		/* force inodes into 64-bit range */
+#define MNTOPT_NOALIGN	"noalign"	/* turn off stripe alignment */
+#define MNTOPT_SUNIT	"sunit"		/* data volume stripe unit */
+#define MNTOPT_SWIDTH	"swidth"	/* data volume stripe width */
+#define MNTOPT_NORECOVERY "norecovery"	/* don't run XFS recovery */
+#define MNTOPT_OSYNCISDSYNC "osyncisdsync" /* o_sync == o_dsync on this fs */
+					   /* (this is now the default!) */
+#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
+#define MNTOPT_QUOTA	"quota"		/* disk quotas */
+#define MNTOPT_MRQUOTA	"mrquota"	/* don't turnoff if SB has quotas on */
+#define MNTOPT_NOQUOTA	"noquota"	/* no quotas */
+#define MNTOPT_UQUOTA	"usrquota"	/* user quota enabled */
+#define MNTOPT_GQUOTA	"grpquota"	/* group quota enabled */
+#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
+#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
+#define MNTOPT_QUOTANOENF  "qnoenforce" /* same as uqnoenforce */
+#define MNTOPT_NOUUID	"nouuid"	/* Ignore FS uuid */
+#define MNTOPT_IRIXSGID "irixsgid"	/* Irix-style sgid inheritance */
+#define MNTOPT_NOLOGFLUSH  "nologflush"	/* Don't use hard flushes in
+					   log writing */
+#define MNTOPT_MTPT	"mtpt"		/* filesystem mount point */
+
+STATIC int
+xfs_parseargs(
+	char		*options,
+	int		flags,
+	struct xfs_mount_args *args)
+{
+	char		*this_char, *value, *eov;
+	int		logbufs = -1;
+	int		logbufsize = -1;
+	int		dsunit, dswidth, vol_dsunit, vol_dswidth;
+	int		iosize;
+	int		rval = 1;	/* failure is default */
+
+	iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
+	memset(args, 0, sizeof(struct xfs_mount_args));
+	args->slcount = args->stimeout = args->ctimeout = -1;
+	args->mtpt[0] = '\0';
+
+	/* Copy the already-parsed mount(2) flags we're interested in */
+	if (flags & MS_NOATIME)
+		args->flags |= XFSMNT_NOATIME;
+
+	if (!options) {
+		args->logbufs = logbufs;
+		args->logbufsize = logbufsize;
+		return 0;
+	}
+
+	while ((this_char = strsep (&options, ",")) != NULL) {
+		if (!*this_char)
+			continue;
+		if ((value = strchr (this_char, '=')) != NULL)
+			*value++ = 0;
+
+		if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
+			logbufs = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
+			int in_kilobytes = 0;
+
+			if (toupper(value[strlen(value)-1]) == 'K') {
+				in_kilobytes = 1;
+				value[strlen(value)-1] = '\0';
+			}
+			logbufsize = simple_strtoul(value, &eov, 10);
+			if (in_kilobytes)
+				logbufsize = logbufsize * 1024;
+		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
+			strncpy(args->logname, value, MAXNAMELEN);
+		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
+			strncpy(args->mtpt, value, MAXNAMELEN);
+#if CONFIG_XFS_DMAPI
+		} else if (!strcmp(this_char, MNTOPT_DMAPI)) {
+			args->flags |= XFSMNT_DMAPI;
+		} else if (!strcmp(this_char, MNTOPT_XDSM)) {
+			args->flags |= XFSMNT_DMAPI;
+#else
+		} else if (!strcmp(this_char, MNTOPT_DMAPI) ||
+			   !strcmp(this_char, MNTOPT_XDSM)) {
+			printk("XFS: this kernel does not support dmapi/xdsm.\n");
+			return rval;
+#endif
+		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
+			strncpy(args->rtname, value, MAXNAMELEN);
+		} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
+			iosize = simple_strtoul(value, &eov, 10);
+			args->flags |= XFSMNT_IOSIZE;
+			args->iosizelog = (uint8_t) iosize;
+		} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
+			args->flags |= XFSMNT_WSYNC;
+		} else if (!strcmp(this_char, MNTOPT_OSYNCISDSYNC)) {
+			/* no-op, this is now the default */
+printk("XFS: osyncisdsync is now the default, and will soon be deprecated.\n");
+		} else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
+			args->flags |= XFSMNT_OSYNCISOSYNC;
+		} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
+			args->flags |= XFSMNT_NORECOVERY;
+		} else if (!strcmp(this_char, MNTOPT_INO64)) {
+#ifdef XFS_BIG_FILESYSTEMS
+			args->flags |= XFSMNT_INO64;
+#else
+			printk("XFS: ino64 option not allowed on this system\n");
+			return rval;
+#endif
+		} else if (!strcmp(this_char, MNTOPT_UQUOTA)) {
+			args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_QUOTA)) {
+			args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_UQUOTANOENF)) {
+			args->flags |= XFSMNT_UQUOTA;
+			args->flags &= ~XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_QUOTANOENF)) {
+			args->flags |= XFSMNT_UQUOTA;
+			args->flags &= ~XFSMNT_UQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_GQUOTA)) {
+			args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
+			args->flags |= XFSMNT_GQUOTA;
+			args->flags &= ~XFSMNT_GQUOTAENF;
+		} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
+			args->flags |= XFSMNT_NOALIGN;
+		} else if (!strcmp(this_char, MNTOPT_SUNIT)) {
+			dsunit = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
+			dswidth = simple_strtoul(value, &eov, 10);
+		} else if (!strcmp(this_char, MNTOPT_NOUUID)) {
+			args->flags |= XFSMNT_NOUUID;
+		} else if (!strcmp(this_char, MNTOPT_IRIXSGID)) {
+			args->flags |= XFSMNT_IRIXSGID;
+		} else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) {
+			args->flags |= XFSMNT_NOLOGFLUSH;
+		} else {
+			printk("XFS: unknown mount option [%s].\n", this_char);
+			return rval;
+		}
+	}
+
+	if (args->flags & XFSMNT_NORECOVERY) {
+		if ((flags & MS_RDONLY) == 0) {
+			printk("XFS: no-recovery mounts must be read-only.\n");
+			return rval;
+		}
+	}
+
+	if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
+		printk(
+	"XFS: sunit and swidth options incompatible with the noalign option\n");
+		return rval;
+	}
+
+	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
+		printk("XFS: sunit and swidth must be specified together\n");
+		return rval;
+	}
+
+	if (dsunit && (dswidth % dsunit != 0)) {
+		printk(
+	"XFS: stripe width (%d) must be a multiple of the stripe unit (%d)\n",
+			dswidth, dsunit);
+		return rval;
+	}
+
+	if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+		if (dsunit) {
+			args->sunit = dsunit;
+			args->flags |= XFSMNT_RETERR;
+		} else
+			args->sunit = vol_dsunit;
+		dswidth ? (args->swidth = dswidth) :
+			  (args->swidth = vol_dswidth);
+	} else
+		args->sunit = args->swidth = 0;
+
+	args->logbufs = logbufs;
+	args->logbufsize = logbufsize;
+
+	return 0;
+}
+
+/*
+ * Convert one device special file to a dev_t.
+ * Helper routine, used only by spectodevs below.
+ */
+STATIC int
+spectodev(
+	const char	*name,
+	const char	*id,
+	dev_t		*dev)
+{
+	struct nameidata nd;
+	int		error;
+
+	error = path_lookup(name, LOOKUP_FOLLOW, &nd);
+	if (error)
+		return error;
+
+	*dev = kdev_t_to_nr(nd.dentry->d_inode->i_rdev);
+	path_release(&nd);
+	return 0;
+}
+
+/*
+ * Convert device special files to dev_t for data, log, realtime.
+ */
+int
+spectodevs(
+	struct super_block *sb,
+	struct xfs_mount_args *args,
+	dev_t		*ddevp,
+	dev_t		*logdevp,
+	dev_t		*rtdevp)
+{
+	int		rval = 0;
+
+	*ddevp = sb->s_dev;
+
+	if (args->logname[0])
+		rval = spectodev(args->logname, "log", logdevp);
+	else
+		*logdevp = sb->s_dev;
+
+	if (args->rtname[0] && !rval)
+		rval = spectodev(args->rtname, "realtime", rtdevp);
+	else
+		*rtdevp = 0;
+	return rval;
+}
+
+
+static kmem_cache_t * linvfs_inode_cachep;
+
+static __inline__ unsigned int gfp_mask(void)
+{
+	/* If we're not in a transaction, FS activity is ok */
+	if (current->flags & PF_FSTRANS) return GFP_NOFS;
+	return GFP_KERNEL;
+}
+
+
+static struct inode *linvfs_alloc_inode(struct super_block *sb)
+{
+	vnode_t *vp;
+
+	vp = (vnode_t *)kmem_cache_alloc(linvfs_inode_cachep, gfp_mask());
+	if (!vp)
+		return NULL;
+	return LINVFS_GET_IP(vp);
+}
+
+static void linvfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(linvfs_inode_cachep, LINVFS_GET_VP(inode));
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+	vnode_t *vp = (vnode_t *)foo;
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(LINVFS_GET_IP(vp));
+}
+
+static int init_inodecache(void)
+{
+	linvfs_inode_cachep = kmem_cache_create("linvfs_icache",
+				sizeof(vnode_t), 0, SLAB_HWCACHE_ALIGN,
+				init_once, NULL);
+
+	if (linvfs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	if (kmem_cache_destroy(linvfs_inode_cachep))
+		printk(KERN_INFO "linvfs_inode_cache: not all structures were freed\n");
+}
+
+static int
+linvfs_fill_super(
+	struct super_block *sb,
+	void		*data,
+	int		silent)
+{
+	vfs_t		*vfsp;
+	vfsops_t	*vfsops;
+	vnode_t		*rootvp;
+	struct inode	*ip;
+	struct xfs_mount_args *args;
+	struct statfs	statvfs;
+	int		error = EINVAL;
+
+	args = (struct xfs_mount_args *)kmalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
+	if (!args)
+		return	-EINVAL;
+	strncpy(args->fsname, sb->s_id, MAXNAMELEN);
+
+	if (xfs_parseargs((char *)data, sb->s_flags, args)) {
+		goto out_null;
+	}
+	strncpy(args->fsname, bdevname(sb->s_bdev), MAXNAMELEN);
+	/* args->rtdev and args->logdev done in xfs_parseargs */
+
+	/*  Kludge in XFS until we have other VFS/VNODE FSs  */
+	vfsops = &xfs_vfsops;
+
+	/*  Set up the vfs_t structure	*/
+	vfsp = vfs_allocate();
+	if (!vfsp) {
+		error = ENOMEM;
+		goto out_null;
+	}
+
+	if (sb->s_flags & MS_RDONLY)
+		vfsp->vfs_flag |= VFS_RDONLY;
+
+	vfsp->vfs_super = sb;
+	set_blocksize(sb->s_bdev, BBSIZE);
+	set_max_bytes(sb);
+	set_quota_ops(sb);
+	sb->s_op = &linvfs_sops;
+	sb->s_export_op = &linvfs_export_ops;
+
+	LINVFS_SET_VFS(sb, vfsp);
+
+	VFSOPS_MOUNT(vfsops, vfsp, args, NULL, error);
+	if (error)
+		goto fail_vfsop;
+
+	VFS_STATVFS(vfsp, &statvfs, NULL, error);
+	if (error)
+		goto fail_unmount;
+
+	sb->s_magic = XFS_SB_MAGIC;
+	sb->s_dirt = 1;
+	sb->s_blocksize = statvfs.f_bsize;
+	sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1;
+
+	VFS_ROOT(vfsp, &rootvp, error);
+	if (error)
+		goto fail_unmount;
+
+	ip = LINVFS_GET_IP(rootvp);
+	linvfs_revalidate_core(ip, ATTR_COMM);
+
+	sb->s_root = d_alloc_root(ip);
+	if (!sb->s_root)
+		goto fail_vnrele;
+	if (is_bad_inode(sb->s_root->d_inode))
+		goto fail_vnrele;
+
+	/* Don't set the VFS_DMI flag until here because we don't want
+	 * to send events while replaying the log.
+	 */
+	if (args->flags & XFSMNT_DMAPI) {
+		vfsp->vfs_flag |= VFS_DMI;
+		VFSOPS_DMAPI_MOUNT(vfsops, vfsp, args->mtpt, args->fsname,
+				   error);
+
+		if (error) {
+			if (atomic_read(&sb->s_active) == 1)
+				vfsp->vfs_flag &= ~VFS_DMI;
+			goto fail_vnrele;
+		}
+	}
+	set_posix_acl(sb);
+
+	vn_trace_exit(rootvp, "linvfs_read_super", (inst_t *)__return_address);
+
+	kfree(args);
+	return 0;
+
+fail_vnrele:
+	if (sb->s_root) {
+		dput(sb->s_root);
+		sb->s_root = NULL;
+	} else {
+		VN_RELE(rootvp);
+	}
+
+fail_unmount:
+	VFS_UNMOUNT(vfsp, 0, NULL, error);
+
+fail_vfsop:
+	vfs_deallocate(vfsp);
+
+out_null:
+	kfree(args);
+	return -error;
+}
+
+void
+linvfs_set_inode_ops(
+	struct inode	*inode)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	inode->i_mode = VTTOIF(vp->v_type);
+
+	/* If this isn't a new inode, nothing to do */
+	if (!(inode->i_state & I_NEW))
+		return;
+
+	if (vp->v_type == VNON) {
+		make_bad_inode(inode);
+	} else if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &linvfs_file_inode_operations;
+		inode->i_fop = &linvfs_file_operations;
+		inode->i_mapping->a_ops = &linvfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &linvfs_dir_inode_operations;
+		inode->i_fop = &linvfs_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &linvfs_symlink_inode_operations;
+		if (inode->i_blocks)
+			inode->i_mapping->a_ops = &linvfs_aops;
+	} else {
+		inode->i_op = &linvfs_file_inode_operations;
+		init_special_inode(inode, inode->i_mode,
+					kdev_t_to_nr(inode->i_rdev));
+	}
+
+	unlock_new_inode(inode);
+}
+
+/*
+ * We do not actually write the inode here, just mark the
+ * super block dirty so that sync_supers calls us and
+ * forces the flush.
+ */
+void
+linvfs_write_inode(
+	struct inode	*inode,
+	int		sync)
+{
+	vnode_t *vp = LINVFS_GET_VP(inode);
+	int	error, flags = FLUSH_INODE;
+
+	if (vp) {
+		vn_trace_entry(vp, "linvfs_write_inode",
+					(inst_t *)__return_address);
+
+		if (sync)
+			flags |= FLUSH_SYNC;
+		VOP_IFLUSH(vp, flags, error);
+		if (error == EAGAIN)
+			inode->i_sb->s_dirt = 1;
+	}
+}
+
+void
+linvfs_clear_inode(
+	struct inode	*inode)
+{
+	vnode_t		*vp = LINVFS_GET_VP(inode);
+
+	if (vp) {
+		vn_rele(vp);
+		vn_trace_entry(vp, "linvfs_clear_inode",
+					(inst_t *)__return_address);
+		/*
+		 * Do all our cleanup, and remove this vnode.
+		 */
+		vp->v_flag |= VPURGE;
+		vn_remove(vp);
+	}
+}
+
+void
+linvfs_put_inode(
+	struct inode	*ip)
+{
+	vnode_t		*vp = LINVFS_GET_VP(ip);
+	int		error;
+
+	if (vp && vp->v_fbhv && (atomic_read(&ip->i_count) == 1))
+		VOP_RELEASE(vp, error);
+}
+
+void
+linvfs_put_super(
+	struct super_block *sb)
+{
+	int		error;
+	int		sector_size;
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+
+	VFS_DOUNMOUNT(vfsp, 0, NULL, NULL, error);
+	if (error) {
+		printk("XFS unmount got error %d\n", error);
+		printk("linvfs_put_super: vfsp/0x%p left dangling!\n", vfsp);
+		return;
+	}
+
+	vfs_deallocate(vfsp);
+
+	/* Reset device block size */
+	sector_size = bdev_hardsect_size(sb->s_bdev);
+	set_blocksize(sb->s_bdev, sector_size);
+}
+
+void
+linvfs_write_super(
+	struct super_block *sb)
+{
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+	int		error;
+
+	sb->s_dirt = 0;
+	if (sb->s_flags & MS_RDONLY)
+		return;
+	VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR,
+		NULL, error);
+}
+
+int
+linvfs_statfs(
+	struct super_block *sb,
+	struct statfs	*statp)
+{
+	vfs_t		*vfsp = LINVFS_GET_VFS(sb);
+	int		error;
+
+	VFS_STATVFS(vfsp, statp, NULL, error);
+
+	return error;
+}
+
+int
+linvfs_remount(
+	struct super_block *sb,
+	int		*flags,
+	char		*options)
+{
+	struct xfs_mount_args *args;
+	vfs_t		*vfsp;
+	xfs_mount_t	*mp;
+	int		error = 0;
+
+	args = (struct xfs_mount_args *)kmalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+
+	if (xfs_parseargs(options, *flags, args)) {
+		error = -EINVAL;
+		goto out;
+	}
+	set_posix_acl(sb);
+
+	if (args->flags & XFSMNT_NOATIME)
+		mp->m_flags |= XFS_MOUNT_NOATIME;
+	else
+		mp->m_flags &= ~XFS_MOUNT_NOATIME;
+
+	linvfs_write_super(sb);
+
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out;
+
+	if (*flags & MS_RDONLY) {
+		sb->s_flags |= MS_RDONLY;
+		XFS_log_write_unmount_ro(vfsp->vfs_fbhv);
+		vfsp->vfs_flag |= VFS_RDONLY;
+	} else {
+		vfsp->vfs_flag &= ~VFS_RDONLY;
+	}
+
+out:
+	kfree(args);
+	return error;
+}
+
+void
+linvfs_freeze_fs(
+	struct super_block *sb)
+{
+	vfs_t		*vfsp;
+	vnode_t		*vp;
+	int		error;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	if (sb->s_flags & MS_RDONLY)
+		return;
+	VFS_ROOT(vfsp, &vp, error);
+	VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, XFS_IOC_FREEZE, 0, error);
+	VN_RELE(vp);
+}
+
+void
+linvfs_unfreeze_fs(
+	struct super_block *sb)
+{
+	vfs_t		*vfsp;
+	vnode_t		*vp;
+	int		error;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	VFS_ROOT(vfsp, &vp, error);
+	VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, XFS_IOC_THAW, 0, error);
+	VN_RELE(vp);
+}
+
+
+struct dentry *linvfs_get_parent(struct dentry *child)
+{
+	int		error;
+	vnode_t		*vp, *cvp;
+	struct dentry	*parent;
+	struct inode	*ip = NULL;
+	struct dentry dotdot;
+
+	dotdot.d_name.name = "..";
+	dotdot.d_name.len = 2;
+	dotdot.d_inode = 0;
+
+	cvp = NULL;
+	vp = LINVFS_GET_VP(child->d_inode);
+	VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
+
+	if (!error) {
+		ASSERT(cvp);
+		ip = LINVFS_GET_IP(cvp);
+		if (!ip) {
+			VN_RELE(cvp);
+			return ERR_PTR(-EACCES);
+		}
+		error = -linvfs_revalidate_core(ip, ATTR_COMM);
+	}
+	if (error)
+		return ERR_PTR(-error);
+	parent = d_alloc_anon(ip);
+	if (!parent) {
+		VN_RELE(cvp);
+		parent = ERR_PTR(-ENOMEM);
+	}
+	return parent;
+}
+
+static struct super_block *linvfs_get_sb(struct file_system_type *fs_type,
+	int flags, char *dev_name, void *data)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, linvfs_fill_super);
+}
+
+static struct export_operations linvfs_export_ops = {
+	.get_parent		= linvfs_get_parent,
+};
+
+int
+linvfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	vfs_t		*vfsp;
+	xfs_mount_t	*mp;
+	static struct proc_xfs_info {
+		int flag;
+		char *str;
+	} xfs_info[] = {
+		/* the few simple ones we can get from the mount struct */
+		{ XFS_MOUNT_NOALIGN,		",noalign" },
+		{ XFS_MOUNT_NORECOVERY,		",norecovery" },
+		{ XFS_MOUNT_OSYNCISOSYNC,	",osyncisosync" },
+		{ XFS_MOUNT_NOUUID,		",nouuid" },
+		{ XFS_MOUNT_IRIXSGID,		",irixsgid" },
+		{ 0, NULL }
+	};
+
+	struct proc_xfs_info *xfs_infop;
+
+	vfsp = LINVFS_GET_VFS(mnt->mnt_sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+
+	for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
+		if (mp->m_flags & xfs_infop->flag)
+			seq_puts(m, xfs_infop->str);
+	}
+
+	if (mp->m_qflags & XFS_UQUOTA_ACCT) {
+		seq_puts(m, ",uquota");
+		if (!(mp->m_qflags & XFS_UQUOTA_ENFD))
+			seq_puts(m, ",uqnoenforce");
+	}
+
+	if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+		seq_puts(m, ",gquota");
+		if (!(mp->m_qflags & XFS_GQUOTA_ENFD))
+			seq_puts(m, ",gqnoenforce");
+	}
+
+	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+		seq_printf(m, ",biosize=%d", mp->m_writeio_log);
+
+	if (mp->m_logbufs > 0)
+		seq_printf(m, ",logbufs=%d", mp->m_logbufs);
+
+	if (mp->m_logbsize > 0)
+		seq_printf(m, ",logbsize=%d", mp->m_logbsize);
+
+	if (mp->m_ddev_targp->pbr_dev != mp->m_logdev_targp->pbr_dev)
+		seq_printf(m, ",logdev=%s",
+				bdevname(mp->m_logdev_targp->pbr_bdev));
+
+	if (mp->m_rtdev_targp &&
+	    mp->m_ddev_targp->pbr_dev != mp->m_rtdev_targp->pbr_dev)
+		seq_printf(m, ",rtdev=%s",
+				bdevname(mp->m_rtdev_targp->pbr_bdev));
+
+	if (mp->m_dalign > 0)
+		seq_printf(m, ",sunit=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_dalign));
+
+	if (mp->m_swidth > 0)
+		seq_printf(m, ",swidth=%d",
+				(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
+
+	if (vfsp->vfs_flag & VFS_DMI)
+		seq_puts(m, ",dmapi");
+
+	return 0;
+}
+
+static struct super_operations linvfs_sops = {
+	.alloc_inode		= linvfs_alloc_inode,
+	.destroy_inode		= linvfs_destroy_inode,
+	.write_inode		= linvfs_write_inode,
+	.put_inode		= linvfs_put_inode,
+	.clear_inode		= linvfs_clear_inode,
+	.put_super		= linvfs_put_super,
+	.write_super		= linvfs_write_super,
+	.write_super_lockfs	= linvfs_freeze_fs,
+	.unlockfs		= linvfs_unfreeze_fs,
+	.statfs			= linvfs_statfs,
+	.remount_fs		= linvfs_remount,
+	.show_options		= linvfs_show_options,
+};
+
+static struct file_system_type xfs_fs_type = {
+	.owner			= THIS_MODULE,
+	.name			= "xfs",
+	.get_sb			= linvfs_get_sb,
+	.kill_sb		= kill_block_super,
+	.fs_flags		= FS_REQUIRES_DEV,
+};
+
+static int __init init_xfs_fs(void)
+{
+	int error;
+	struct sysinfo	si;
+	static char message[] __initdata =
+		KERN_INFO "SGI XFS " XFS_VERSION_STRING " with " 
+		XFS_BUILD_OPTIONS " enabled\n";
+
+	error = init_inodecache();
+	if (error < 0)
+		return error;
+
+	error = pagebuf_init();
+	if (error < 0)
+		goto out;
+
+	si_meminfo(&si);
+	xfs_physmem = si.totalram;
+
+	printk(message);
+
+	vn_init();
+	xfs_init();
+	dmapi_init();
+
+	error = register_filesystem(&xfs_fs_type);
+	if (error)
+		goto out;
+	return 0;
+
+out:
+	destroy_inodecache();
+	return error;
+}
+
+
+static void __exit exit_xfs_fs(void)
+{
+	dmapi_uninit();
+	xfs_cleanup();
+	unregister_filesystem(&xfs_fs_type);
+	pagebuf_terminate();
+	destroy_inodecache();
+}
+
+module_init(init_xfs_fs);
+module_exit(exit_xfs_fs);
+
+MODULE_AUTHOR("SGI <sgi.com>");
+MODULE_DESCRIPTION("SGI XFS " XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
+MODULE_LICENSE("GPL");
diff --git a/fs/xfs/linux/xfs_super.h b/fs/xfs/linux/xfs_super.h
new file mode 100644
index 000000000000..e7d816985d9a
--- /dev/null
+++ b/fs/xfs/linux/xfs_super.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPER_H__
+#define __XFS_SUPER_H__
+
+#ifdef CONFIG_FS_POSIX_ACL
+# define XFS_ACL_STRING		"ACLs, "
+#else
+# define XFS_ACL_STRING
+#endif
+
+#ifdef CONFIG_XFS_DMAPI
+# define XFS_DMAPI_STRING	"DMAPI, "
+#else
+# define XFS_DMAPI_STRING
+#endif
+
+#ifdef CONFIG_XFS_QUOTA
+# define XFS_QUOTA_STRING	"quota, "
+#else
+# define XFS_QUOTA_STRING
+#endif
+
+#ifdef CONFIG_XFS_RT
+# define XFS_RT_STRING		"realtime, "
+#else
+# define XFS_RT_STRING
+#endif
+
+#ifdef CONFIG_XFS_VNODE_TRACING
+# define XFS_VNTRACE_STRING	"VN-trace, "
+#else
+# define XFS_VNTRACE_STRING
+#endif
+
+#ifdef XFSDEBUG
+# define XFS_DBG_STRING		"debug"
+#else
+# define XFS_DBG_STRING		"no debug"
+#endif
+
+#define XFS_BUILD_OPTIONS	XFS_ACL_STRING XFS_DMAPI_STRING \
+				XFS_RT_STRING \
+				XFS_QUOTA_STRING XFS_VNTRACE_STRING \
+				XFS_DBG_STRING /* DBG must be last */
+
+
+#define LINVFS_GET_VFS(s) \
+	(vfs_t *)((s)->u.generic_sbp)
+#define LINVFS_SET_VFS(s, vfsp) \
+	((s)->u.generic_sbp = vfsp)
+
+
+struct xfs_mount_args;
+
+extern void
+linvfs_set_inode_ops(
+	struct inode	*inode);
+
+extern int
+spectodevs(
+	struct super_block *sb,
+	struct xfs_mount_args *args,
+	dev_t		*ddevp,
+	dev_t		*logdevp,
+	dev_t		*rtdevp);
+
+#endif	/* __XFS_SUPER_H__ */
diff --git a/fs/xfs/linux/xfs_sysctl.c b/fs/xfs/linux/xfs_sysctl.c
new file mode 100644
index 000000000000..840810b33f27
--- /dev/null
+++ b/fs/xfs/linux/xfs_sysctl.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+/*
+ * Tunable xfs parameters
+ */
+
+extern struct xfsstats xfsstats;
+
+unsigned long xfs_min[XFS_PARAM] = {			 0,			 0, 0 };
+unsigned long xfs_max[XFS_PARAM] = { XFS_REFCACHE_SIZE_MAX,  XFS_REFCACHE_SIZE_MAX, 1 };
+
+xfs_param_t xfs_params = { 128, 32, 0 };
+
+static struct ctl_table_header *xfs_table_header;
+
+/* proc handlers */
+
+extern void xfs_refcache_resize(int xfs_refcache_new_size);
+
+static int
+xfs_refcache_resize_proc_handler(ctl_table *ctl, int write, struct file * filp,
+		       void *buffer, size_t *lenp)
+{
+	int	ret;
+	int	*valp = ctl->data;
+	int	xfs_refcache_new_size;
+	int	xfs_refcache_old_size = *valp;
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+	xfs_refcache_new_size = *valp;
+
+	if (!ret && write && xfs_refcache_new_size != xfs_refcache_old_size) {
+		xfs_refcache_resize(xfs_refcache_new_size);
+		/* Don't purge more than size of the cache */
+		if (xfs_refcache_new_size < xfs_params.refcache_purge)
+			xfs_params.refcache_purge = xfs_refcache_new_size;
+	}
+
+	return ret;
+}
+
+static int
+xfs_stats_clear_proc_handler(ctl_table *ctl, int write, struct file * filp,
+		       void *buffer, size_t *lenp)
+{
+	int		ret;
+	int		*valp = ctl->data;
+	__uint32_t	vn_active;
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+
+	if (!ret && write && *valp) {
+		printk("XFS Clearing xfsstats\n");
+		/* save vn_active, it's a universal truth! */
+		vn_active = xfsstats.vn_active;
+		memset(&xfsstats, 0, sizeof(xfsstats));
+		xfsstats.vn_active = vn_active;
+		xfs_params.stats_clear = 0;
+	}
+
+	return ret;
+}
+
+static ctl_table xfs_table[] = {
+	{XFS_REFCACHE_SIZE, "refcache_size", &xfs_params.refcache_size,
+	sizeof(ulong), 0644, NULL, &xfs_refcache_resize_proc_handler,
+	&sysctl_intvec, NULL, &xfs_min[0], &xfs_max[0]},
+
+	{XFS_REFCACHE_PURGE, "refcache_purge", &xfs_params.refcache_purge,
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_minmax,
+	&sysctl_intvec, NULL, &xfs_min[1], &xfs_params.refcache_size},
+
+	{XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear,
+	sizeof(ulong), 0644, NULL, &xfs_stats_clear_proc_handler,
+	&sysctl_intvec, NULL, &xfs_min[2], &xfs_max[2]},
+
+	{0}
+};
+
+static ctl_table xfs_dir_table[] = {
+	{FS_XFS, "xfs", NULL, 0, 0555, xfs_table},
+	{0}
+};
+
+static ctl_table xfs_root_table[] = {
+	{CTL_FS, "fs",	NULL, 0, 0555, xfs_dir_table},
+	{0}
+};
+
+void
+xfs_sysctl_register(void)
+{
+	xfs_table_header = register_sysctl_table(xfs_root_table, 1);
+}
+
+void
+xfs_sysctl_unregister(void)
+{
+	if (xfs_table_header)
+		unregister_sysctl_table(xfs_table_header);
+}
diff --git a/fs/xfs/linux/xfs_sysctl.h b/fs/xfs/linux/xfs_sysctl.h
new file mode 100644
index 000000000000..6649017ec372
--- /dev/null
+++ b/fs/xfs/linux/xfs_sysctl.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __XFS_SYSCTL_H__
+#define __XFS_SYSCTL_H__
+
+#include <linux/sysctl.h>
+
+/*
+ * Tunable xfs parameters
+ */
+
+#define XFS_PARAM	3
+
+typedef struct xfs_param {
+	ulong	refcache_size;	/* Size of nfs refcache */
+	ulong	refcache_purge; /* # of entries to purge each time */
+	ulong	stats_clear;	/* reset all xfs stats to 0 */
+} xfs_param_t;
+
+enum {
+	XFS_REFCACHE_SIZE = 1,
+	XFS_REFCACHE_PURGE = 2,
+	XFS_STATS_CLEAR = 3,
+};
+
+extern xfs_param_t	xfs_params;
+
+#ifdef CONFIG_SYSCTL
+extern void xfs_sysctl_register(void);
+extern void xfs_sysctl_unregister(void);
+#else
+static __inline void xfs_sysctl_register(void) { };
+static __inline void xfs_sysctl_unregister(void) { };
+#endif
+
+#endif /* __XFS_SYSCTL_H__ */
diff --git a/fs/xfs/linux/xfs_version.h b/fs/xfs/linux/xfs_version.h
new file mode 100644
index 000000000000..229bb86ee173
--- /dev/null
+++ b/fs/xfs/linux/xfs_version.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Dummy file that can contain a timestamp to put into the
+ * XFS init string, to help users keep track of what they're
+ * running
+ */
+
+#ifndef __XFS_VERSION_H__
+#define __XFS_VERSION_H__
+
+#define XFS_VERSION_STRING "CVS"
+
+#endif /* __XFS_VERSION_H__ */
diff --git a/fs/xfs/linux/xfs_vfs.h b/fs/xfs/linux/xfs_vfs.h
new file mode 100644
index 000000000000..0f384eb8220f
--- /dev/null
+++ b/fs/xfs/linux/xfs_vfs.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_VFS_H__
+#define __XFS_VFS_H__
+
+#include <linux/vfs.h>
+
+struct statfs;
+struct vnode;
+struct cred;
+struct super_block;
+struct fid;
+struct dm_fcntl_vector;
+struct xfs_mount_args;
+
+typedef struct vfs {
+	u_int		vfs_flag;	/* flags */
+	fsid_t		vfs_fsid;	/* file system id */
+	fsid_t		*vfs_altfsid;	/* An ID fixed for life of FS */
+	bhv_head_t	vfs_bh;		/* head of vfs behavior chain */
+	struct super_block *vfs_super;	/* pointer to super block structure */
+} vfs_t;
+
+#define vfs_fbhv	vfs_bh.bh_first		/* 1st on vfs behavior chain */
+#define VFS_FOPS(vfsp)	\
+	((vfsops_t *)((vfsp)->vfs_fbhv->bd_ops))/* ops for 1st behavior */
+
+
+#define bhvtovfs(bdp)	((struct vfs *)BHV_VOBJ(bdp))
+#define VFS_BHVHEAD(vfsp) (&(vfsp)->vfs_bh)
+
+
+#define VFS_RDONLY	0x0001		/* read-only vfs */
+#define VFS_GRPID	0x0002		/* group-ID assigned from directory */
+#define VFS_DMI		0x0004		/* filesystem has the DMI enabled */
+
+#define SYNC_ATTR	0x0001		/* sync attributes */
+#define SYNC_CLOSE	0x0002		/* close file system down */
+#define SYNC_DELWRI	0x0004		/* look at delayed writes */
+#define SYNC_WAIT	0x0008		/* wait for i/o to complete */
+#define SYNC_FSDATA	0x0020		/* flush fs data (e.g. superblocks) */
+#define SYNC_BDFLUSH	0x0010		/* BDFLUSH is calling -- don't block */
+
+
+typedef struct vfsops {
+	int	(*vfs_mount)(struct vfs *, struct xfs_mount_args *,
+					struct cred *);
+					/* mount file system */
+	int	(*vfs_dounmount)(bhv_desc_t *, int, struct vnode *,
+				 struct cred *);
+					/* preparation and unmount */
+	int	(*vfs_unmount)(bhv_desc_t *, int, struct cred *);
+					/* unmount file system */
+	int	(*vfs_root)(bhv_desc_t *, struct vnode **);
+					/* get root vnode */
+	int	(*vfs_statvfs)(bhv_desc_t *, struct statfs *, struct vnode *);
+					/* get file system statistics */
+	int	(*vfs_sync)(bhv_desc_t *, int, struct cred *);
+					/* flush files */
+	int	(*vfs_vget)(bhv_desc_t *, struct vnode **, struct fid *);
+					/* get vnode from fid */
+	int	(*vfs_dmapi_mount)(struct vfs *, char *, char *);
+					/* send dmapi mount event */
+	int	(*vfs_dmapi_fsys_vector)(bhv_desc_t *,
+					 struct dm_fcntl_vector *);
+	void	(*vfs_force_shutdown)(bhv_desc_t *,
+					int, char *, int);
+} vfsops_t;
+
+#define VFS_DOUNMOUNT(vfsp,f,vp,cr, rv) \
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_dounmount))((vfsp)->vfs_fbhv, f, vp, cr);	\
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_UNMOUNT(vfsp,f,cr, rv)	\
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_unmount))((vfsp)->vfs_fbhv, f, cr);		\
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_ROOT(vfsp, vpp, rv)		\
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_root))((vfsp)->vfs_fbhv, vpp);	\
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_STATVFS(vfsp, sp, vp, rv)	\
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_statvfs))((vfsp)->vfs_fbhv, sp, vp);	\
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_SYNC(vfsp, flag, cr, rv) \
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_sync))((vfsp)->vfs_fbhv, flag, cr); \
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+#define VFS_VGET(vfsp, vpp, fidp, rv) \
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_vget))((vfsp)->vfs_fbhv, vpp, fidp);  \
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+/* No behavior lock here */
+#define VFS_FORCE_SHUTDOWN(vfsp, flags) \
+	(*(VFS_FOPS(vfsp)->vfs_force_shutdown))((vfsp)->vfs_fbhv, flags, __FILE__, __LINE__);
+
+#define VFS_DMAPI_FSYS_VECTOR(vfsp, df, rv) \
+{	\
+	BHV_READ_LOCK(&(vfsp)->vfs_bh); \
+	rv = (*(VFS_FOPS(vfsp)->vfs_dmapi_fsys_vector))((vfsp)->vfs_fbhv, df);	      \
+	BHV_READ_UNLOCK(&(vfsp)->vfs_bh); \
+}
+
+
+#define VFSOPS_DMAPI_MOUNT(vfs_op, vfsp, dir_name, fsname, rv) \
+	rv = (*(vfs_op)->vfs_dmapi_mount)(vfsp, dir_name, fsname)
+#define VFSOPS_MOUNT(vfs_op, vfsp, args, cr, rv) \
+	rv = (*(vfs_op)->vfs_mount)(vfsp, args, cr)
+
+#define VFS_REMOVEBHV(vfsp, bdp)\
+{	\
+	bhv_remove(VFS_BHVHEAD(vfsp), bdp); \
+}
+
+#define PVFS_UNMOUNT(bdp,f,cr, rv)	\
+{	\
+	rv = (*((vfsops_t *)(bdp)->bd_ops)->vfs_unmount)(bdp, f, cr);	\
+}
+
+#define PVFS_SYNC(bdp, flag, cr, rv) \
+{	\
+	rv = (*((vfsops_t *)(bdp)->bd_ops)->vfs_sync)(bdp, flag, cr);	\
+}
+
+
+static __inline vfs_t *
+vfs_allocate(void)
+{
+	vfs_t	*vfsp;
+
+	vfsp = kmalloc(sizeof(vfs_t), GFP_KERNEL);
+	if (vfsp) {
+		memset(vfsp, 0, sizeof(vfs_t));
+		bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
+	}
+	return (vfsp);
+}
+
+static __inline void
+vfs_deallocate(
+	vfs_t		*vfsp)
+{
+	bhv_head_destroy(VFS_BHVHEAD(vfsp));
+	kfree(vfsp);
+}
+
+/*
+ * Called by fs dependent VFS_MOUNT code to link the VFS base file system
+ * dependent behavior with the VFS virtual object.
+ */
+static __inline void
+vfs_insertbhv(
+	vfs_t		*vfsp,
+	bhv_desc_t	*bdp,
+	vfsops_t	*vfsops,
+	void		*mount)
+{
+	/*
+	 * Initialize behavior desc with ops and data and then
+	 * attach it to the vfs.
+	 */
+	bhv_desc_init(bdp, mount, vfsp, vfsops);
+	bhv_insert_initial(&vfsp->vfs_bh, bdp);
+}
+
+#endif	/* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux/xfs_vnode.c b/fs/xfs/linux/xfs_vnode.c
new file mode 100644
index 000000000000..0d4cb5ea14eb
--- /dev/null
+++ b/fs/xfs/linux/xfs_vnode.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/pagemap.h>
+
+
+uint64_t vn_generation;		/* vnode generation number */
+
+spinlock_t	vnumber_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Dedicated vnode inactive/reclaim sync semaphores.
+ * Prime number of hash buckets since address is used as the key.
+ */
+#define NVSYNC			37
+#define vptosync(v)		(&vsync[((unsigned long)v) % NVSYNC])
+sv_t vsync[NVSYNC];
+
+/*
+ * Translate stat(2) file types to vnode types and vice versa.
+ * Aware of numeric order of S_IFMT and vnode type values.
+ */
+enum vtype iftovt_tab[] = {
+	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
+};
+
+u_short vttoif_tab[] = {
+	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
+};
+
+#define VN_LOCK(vp)		spin_lock(&(vp)->v_lock)
+#define VN_UNLOCK(vp)		spin_unlock(&(vp)->v_lock)
+
+
+void
+vn_init(void)
+{
+	register sv_t *svp;
+	register int i;
+
+	for (svp = vsync, i = 0; i < NVSYNC; i++, svp++)
+		init_sv(svp, SV_DEFAULT, "vsy", i);
+}
+
+
+/*
+ * Clean a vnode of filesystem-specific data and prepare it for reuse.
+ */
+STATIC int
+vn_reclaim(struct vnode *vp)
+{
+	int error;
+
+	XFS_STATS_INC(xfsstats.vn_reclaim);
+
+	vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address);
+
+	/*
+	 * Only make the VOP_RECLAIM call if there are behaviors
+	 * to call.
+	 */
+	if (vp->v_fbhv != NULL) {
+		VOP_RECLAIM(vp, error);
+		if (error)
+			return -error;
+	}
+	ASSERT(vp->v_fbhv == NULL);
+
+	VN_LOCK(vp);
+
+	vp->v_flag &= (VRECLM|VWAIT);
+	VN_UNLOCK(vp);
+
+	vp->v_type = VNON;
+	vp->v_fbhv = NULL;
+
+#ifdef CONFIG_XFS_VNODE_TRACING
+	ktrace_free(vp->v_trace);
+	vp->v_trace = NULL;
+#endif
+
+	return 0;
+}
+
+STATIC void
+vn_wakeup(struct vnode *vp)
+{
+	VN_LOCK(vp);
+	if (vp->v_flag & VWAIT) {
+		sv_broadcast(vptosync(vp));
+	}
+	vp->v_flag &= ~(VRECLM|VWAIT|VMODIFIED);
+	VN_UNLOCK(vp);
+}
+
+int
+vn_wait(struct vnode *vp)
+{
+	VN_LOCK(vp);
+	if (vp->v_flag & (VINACT | VRECLM)) {
+		vp->v_flag |= VWAIT;
+		sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
+		return 1;
+	}
+	VN_UNLOCK(vp);
+	return 0;
+}
+
+struct vnode *
+vn_initialize(struct inode *inode)
+{
+	struct vnode	*vp = LINVFS_GET_VP(inode);
+
+	XFS_STATS_INC(xfsstats.vn_active);
+
+	vp->v_flag = VMODIFIED;
+	spinlock_init(&vp->v_lock, "v_lock");
+
+	spin_lock(&vnumber_lock);
+	if (!++vn_generation)	/* v_number shouldn't be zero */
+		vn_generation++;
+	vp->v_number = vn_generation;
+	spin_unlock(&vnumber_lock);
+
+	ASSERT(VN_CACHED(vp) == 0);
+
+	/* Initialize the first behavior and the behavior chain head. */
+	vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
+
+#ifdef	CONFIG_XFS_VNODE_TRACING
+	vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+
+	vn_trace_exit(vp, "vn_initialize", (inst_t *)__return_address);
+	return vp;
+}
+
+/*
+ * Get a reference on a vnode.
+ */
+vnode_t *
+vn_get(struct vnode *vp, vmap_t *vmap)
+{
+	struct inode	*inode;
+
+	XFS_STATS_INC(xfsstats.vn_get);
+	inode = LINVFS_GET_IP(vp);
+	if (inode->i_state & I_FREEING)
+		return NULL;
+
+	inode = iget_locked(vmap->v_vfsp->vfs_super, vmap->v_ino);
+	if (inode == NULL)		/* Inode not present */
+		return NULL;
+
+	/* We do not want to create new inodes via vn_get,
+	 * returning NULL here is OK.
+	 */
+	if (inode->i_state & I_NEW) {
+		make_bad_inode(inode);
+		unlock_new_inode(inode);
+		iput(inode);
+		return NULL;
+	}
+
+	vn_trace_exit(vp, "vn_get", (inst_t *)__return_address);
+	ASSERT((vp->v_flag & VPURGE) == 0);
+
+	return vp;
+}
+
+/*
+ * "revalidate" the linux inode.
+ */
+int
+vn_revalidate(struct vnode *vp, int flags)
+{
+	int		error;
+	struct inode	*inode;
+	vattr_t		va;
+
+	vn_trace_entry(vp, "vn_revalidate", (inst_t *)__return_address);
+
+	va.va_mask = AT_STAT|AT_GENCOUNT;
+
+	ASSERT(vp->v_bh.bh_first != NULL);
+
+	VOP_GETATTR(vp, &va, flags & ATTR_LAZY, NULL, error);
+
+	if (! error) {
+		inode = LINVFS_GET_IP(vp);
+		ASSERT(inode);
+
+		inode->i_mode	    = VTTOIF(va.va_type) | va.va_mode;
+		inode->i_nlink	    = va.va_nlink;
+		inode->i_uid	    = va.va_uid;
+		inode->i_gid	    = va.va_gid;
+		inode->i_rdev	    = mk_kdev(MAJOR(va.va_rdev),
+						MINOR(va.va_rdev));
+		inode->i_blksize    = PAGE_CACHE_SIZE;
+		inode->i_generation = va.va_gencount;
+		if ((flags & ATTR_COMM) ||
+		    S_ISREG(inode->i_mode) ||
+		    S_ISDIR(inode->i_mode) ||
+		    S_ISLNK(inode->i_mode)) {
+			inode->i_size	    = va.va_size;
+			inode->i_blocks	    = va.va_nblocks;
+			inode->i_atime	    = va.va_atime.tv_sec;
+			inode->i_mtime	    = va.va_mtime.tv_sec;
+			inode->i_ctime	    = va.va_ctime.tv_sec;
+		}
+		if (flags & ATTR_LAZY)
+			vp->v_flag &= ~VMODIFIED;
+		else
+			VUNMODIFY(vp);
+	} else {
+		vn_trace_exit(vp, "vn_revalidate.error",
+					(inst_t *)__return_address);
+	}
+
+	return -error;
+}
+
+
+/*
+ * purge a vnode from the cache
+ * At this point the vnode is guaranteed to have no references (vn_count == 0)
+ * The caller has to make sure that there are no ways someone could
+ * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
+ */
+void
+vn_purge(struct vnode *vp, vmap_t *vmap)
+{
+	vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
+
+	ASSERT(vp->v_flag & VPURGE);
+
+again:
+	/*
+	 * Check whether vp has already been reclaimed since our caller
+	 * sampled its version while holding a filesystem cache lock that
+	 * its VOP_RECLAIM function acquires.
+	 */
+	VN_LOCK(vp);
+	if (vp->v_number != vmap->v_number) {
+		VN_UNLOCK(vp);
+		return;
+	}
+
+	/*
+	 * If vp is being reclaimed or inactivated, wait until it is inert,
+	 * then proceed.  Can't assume that vnode is actually reclaimed
+	 * just because the reclaimed flag is asserted -- a vn_alloc
+	 * reclaim can fail.
+	 */
+	if (vp->v_flag & (VINACT | VRECLM)) {
+		ASSERT(vn_count(vp) == 0);
+		vp->v_flag |= VWAIT;
+		sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
+		goto again;
+	}
+
+	/*
+	 * Another process could have raced in and gotten this vnode...
+	 */
+	if (vn_count(vp) > 0) {
+		VN_UNLOCK(vp);
+		return;
+	}
+
+	XFS_STATS_DEC(xfsstats.vn_active);
+	vp->v_flag |= VRECLM;
+	VN_UNLOCK(vp);
+
+	/*
+	 * Call VOP_RECLAIM and clean vp. The FSYNC_INVAL flag tells
+	 * vp's filesystem to flush and invalidate all cached resources.
+	 * When vn_reclaim returns, vp should have no private data,
+	 * either in a system cache or attached to v_data.
+	 */
+	if (vn_reclaim(vp) != 0)
+		panic("vn_purge: cannot reclaim");
+
+	/*
+	 * Wakeup anyone waiting for vp to be reclaimed.
+	 */
+	vn_wakeup(vp);
+}
+
+/*
+ * Add a reference to a referenced vnode.
+ */
+struct vnode *
+vn_hold(struct vnode *vp)
+{
+	struct inode *inode;
+
+	XFS_STATS_INC(xfsstats.vn_hold);
+
+	VN_LOCK(vp);
+	inode = igrab(LINVFS_GET_IP(vp));
+	ASSERT(inode);
+	VN_UNLOCK(vp);
+
+	return vp;
+}
+
+/*
+ *  Call VOP_INACTIVE on last reference.
+ */
+void
+vn_rele(struct vnode *vp)
+{
+	int	vcnt;
+	/* REFERENCED */
+	int cache;
+
+	XFS_STATS_INC(xfsstats.vn_rele);
+
+
+	VN_LOCK(vp);
+
+	vn_trace_entry(vp, "vn_rele", (inst_t *)__return_address);
+	vcnt = vn_count(vp);
+
+	/*
+	 * Since we always get called from put_inode we know
+	 * that i_count won't be decremented after we
+	 * return.
+	 */
+	if (vcnt == 0) {
+		/*
+		 * As soon as we turn this on, noone can find us in vn_get
+		 * until we turn off VINACT or VRECLM
+		 */
+		vp->v_flag |= VINACT;
+		VN_UNLOCK(vp);
+
+		/*
+		 * Do not make the VOP_INACTIVE call if there
+		 * are no behaviors attached to the vnode to call.
+		 */
+		if (vp->v_fbhv != NULL) {
+			VOP_INACTIVE(vp, NULL, cache);
+		}
+
+		VN_LOCK(vp);
+		if (vp->v_flag & VWAIT) {
+			if (vp->v_flag & VWAIT) {
+				sv_broadcast(vptosync(vp));
+			}
+		}
+
+		vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VMODIFIED);
+
+	}
+
+	VN_UNLOCK(vp);
+
+	vn_trace_exit(vp, "vn_rele", (inst_t *)__return_address);
+}
+
+
+/*
+ * Finish the removal of a vnode.
+ */
+void
+vn_remove(struct vnode *vp)
+{
+	/* REFERENCED */
+	vmap_t	vmap;
+
+	/* Make sure we don't do this to the same vnode twice */
+	if (!(vp->v_fbhv))
+		return;
+
+	XFS_STATS_INC(xfsstats.vn_remove);
+
+	vn_trace_exit(vp, "vn_remove", (inst_t *)__return_address);
+
+	/*
+	 * After the following purge the vnode
+	 * will no longer exist.
+	 */
+	VMAP(vp, XFS_BHVTOI(vp->v_fbhv), vmap);
+
+	vn_purge(vp, &vmap);
+}
+
+
+#ifdef	CONFIG_XFS_VNODE_TRACING
+
+#define KTRACE_ENTER(vp, vk, s, line, ra)			\
+	ktrace_enter(	(vp)->v_trace,				\
+/*  0 */		(void *)(__psint_t)(vk),		\
+/*  1 */		(void *)(s),				\
+/*  2 */		(void *)(__psint_t) line,		\
+/*  3 */		(void *)(vn_count(vp)), \
+/*  4 */		(void *)(ra),				\
+/*  5 */		(void *)(__psunsigned_t)(vp)->v_flag,	\
+/*  6 */		(void *)(__psint_t)smp_processor_id(),	\
+/*  7 */		(void *)(__psint_t)(current->pid),	\
+/*  8 */		(void *)__return_address,		\
+/*  9 */		0, 0, 0, 0, 0, 0, 0)
+
+/*
+ * Vnode tracing code.
+ */
+void
+vn_trace_entry(vnode_t *vp, char *func, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra);
+}
+
+void
+vn_trace_exit(vnode_t *vp, char *func, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra);
+}
+
+void
+vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra);
+}
+
+void
+vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra);
+}
+
+void
+vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra);
+}
+#endif	/* CONFIG_XFS_VNODE_TRACING */
diff --git a/fs/xfs/linux/xfs_vnode.h b/fs/xfs/linux/xfs_vnode.h
new file mode 100644
index 000000000000..e9ec89741910
--- /dev/null
+++ b/fs/xfs/linux/xfs_vnode.h
@@ -0,0 +1,765 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_VNODE_H__
+#define __XFS_VNODE_H__
+
+/*
+ * Vnode types (unrelated to on-disk inodes).  VNON means no type.
+ */
+typedef enum vtype {
+	VNON	= 0,
+	VREG	= 1,
+	VDIR	= 2,
+	VBLK	= 3,
+	VCHR	= 4,
+	VLNK	= 5,
+	VFIFO	= 6,
+	VBAD	= 7,
+	VSOCK	= 8
+} vtype_t;
+
+typedef __u64	vnumber_t;
+
+/*
+ * Define the type of behavior head used by vnodes.
+ */
+#define vn_bhv_head_t	bhv_head_t
+
+/*
+ * MP locking protocols:
+ *	v_flag, v_count				VN_LOCK/VN_UNLOCK
+ *	v_vfsp					VN_LOCK/VN_UNLOCK
+ *	v_type					read-only or fs-dependent
+ *	v_list, v_hashp, v_hashn		freelist lock
+ */
+typedef struct vnode {
+	__u32		v_flag;			/* vnode flags (see below) */
+	enum vtype	v_type;			/* vnode type		*/
+	struct vfs	*v_vfsp;		/* ptr to containing VFS*/
+	vnumber_t	v_number;		/* in-core vnode number */
+	vn_bhv_head_t	v_bh;			/* behavior head */
+
+	spinlock_t	v_lock;			/* don't use VLOCK on Linux */
+	struct inode	v_inode;		/* linux inode */
+#ifdef	CONFIG_XFS_VNODE_TRACING
+	struct ktrace	*v_trace;		/* trace header structure    */
+#endif	/* CONFIG_XFS_VNODE_TRACING */
+} vnode_t;
+
+/*
+ * Vnode to Linux inode mapping.
+ */
+#define LINVFS_GET_VP(inode)	((vnode_t *)list_entry(inode, vnode_t, v_inode))
+#define LINVFS_GET_IP(vp)	(&(vp)->v_inode)
+
+/*
+ * Conversion between vnode types/modes and encoded type/mode as
+ * seen by stat(2) and mknod(2).
+ */
+extern enum vtype	iftovt_tab[];
+extern ushort		vttoif_tab[];
+#define IFTOVT(M)	(iftovt_tab[((M) & S_IFMT) >> 12])
+#define VTTOIF(T)	(vttoif_tab[(int)(T)])
+#define MAKEIMODE(T, M) (VTTOIF(T) | ((M) & ~S_IFMT))
+
+/*
+ * Vnode flags.
+ *
+ * The vnode flags fall into two categories:
+ * 1) Local only -
+ *    Flags that are relevant only to a particular cell
+ * 2) Single system image -
+ *    Flags that must be maintained coherent across all cells
+ */
+ /* Local only flags */
+#define VINACT		       0x1	/* vnode is being inactivated	*/
+#define VRECLM		       0x2	/* vnode is being reclaimed	*/
+#define VWAIT		       0x4	/* waiting for VINACT
+					   or VRECLM to finish */
+#define VMODIFIED	       0x8	/* xfs inode state possibly different
+					 * from linux inode state.
+					 */
+
+/* Single system image flags */
+#define VROOT		  0x100000	/* root of its file system	*/
+#define VNOSWAP		  0x200000	/* cannot be used as virt swap device */
+#define VISSWAP		  0x400000	/* vnode is part of virt swap device */
+#define VREPLICABLE	  0x800000	/* Vnode can have replicated pages */
+#define VNONREPLICABLE	 0x1000000	/* Vnode has writers. Don't replicate */
+#define VDOCMP		 0x2000000	/* Vnode has special VOP_CMP impl. */
+#define VSHARE		 0x4000000	/* vnode part of global cache	*/
+					/* VSHARE applies to local cell only */
+#define VFRLOCKS	 0x8000000	/* vnode has FR locks applied	*/
+#define VENF_LOCKING	0x10000000	/* enf. mode FR locking in effect */
+#define VOPLOCK		0x20000000	/* oplock set on the vnode	*/
+#define VPURGE		0x40000000	/* In the linux 'put' thread	*/
+
+typedef enum vrwlock	{ VRWLOCK_NONE, VRWLOCK_READ,
+			  VRWLOCK_WRITE, VRWLOCK_WRITE_DIRECT,
+			  VRWLOCK_TRY_READ, VRWLOCK_TRY_WRITE } vrwlock_t;
+
+/*
+ * Return values for VOP_INACTIVE.  A return value of
+ * VN_INACTIVE_NOCACHE implies that the file system behavior
+ * has disassociated its state and bhv_desc_t from the vnode.
+ */
+#define VN_INACTIVE_CACHE	0
+#define VN_INACTIVE_NOCACHE	1
+
+/*
+ * Values for the cmd code given to VOP_VNODE_CHANGE.
+ */
+typedef enum vchange {
+	VCHANGE_FLAGS_FRLOCKS		= 0,
+	VCHANGE_FLAGS_ENF_LOCKING	= 1,
+	VCHANGE_FLAGS_TRUNCATED		= 2,
+	VCHANGE_FLAGS_PAGE_DIRTY	= 3,
+	VCHANGE_FLAGS_IOEXCL_COUNT	= 4
+} vchange_t;
+
+/*
+ * Macros for dealing with the behavior descriptor inside of the vnode.
+ */
+#define BHV_TO_VNODE(bdp)	((vnode_t *)BHV_VOBJ(bdp))
+#define BHV_TO_VNODE_NULL(bdp)	((vnode_t *)BHV_VOBJNULL(bdp))
+
+#define VNODE_TO_FIRST_BHV(vp)		(BHV_HEAD_FIRST(&(vp)->v_bh))
+#define VN_BHV_HEAD(vp)			((vn_bhv_head_t *)(&((vp)->v_bh)))
+#define VN_BHV_READ_LOCK(bhp)		BHV_READ_LOCK(bhp)
+#define VN_BHV_READ_UNLOCK(bhp)		BHV_READ_UNLOCK(bhp)
+#define VN_BHV_WRITE_LOCK(bhp)		BHV_WRITE_LOCK(bhp)
+#define VN_BHV_NOT_READ_LOCKED(bhp)	BHV_NOT_READ_LOCKED(bhp)
+#define VN_BHV_NOT_WRITE_LOCKED(bhp)	BHV_NOT_WRITE_LOCKED(bhp)
+#define vn_bhv_head_init(bhp,name)	bhv_head_init(bhp,name)
+#define vn_bhv_head_reinit(bhp)		bhv_head_reinit(bhp)
+#define vn_bhv_insert_initial(bhp,bdp)	bhv_insert_initial(bhp,bdp)
+#define vn_bhv_remove(bhp,bdp)		bhv_remove(bhp,bdp)
+#define vn_bhv_lookup(bhp,ops)		bhv_lookup(bhp,ops)
+#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops)
+
+#define v_fbhv		v_bh.bh_first	       /* first behavior */
+#define v_fops		v_bh.bh_first->bd_ops  /* ops for first behavior */
+
+
+union rval;
+struct uio;
+struct file;
+struct vattr;
+struct page_buf_bmap_s;
+struct attrlist_cursor_kern;
+
+typedef int	(*vop_open_t)(bhv_desc_t *, struct cred *);
+typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct file *, char *, size_t,
+				loff_t *, struct cred *);
+typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct file *, const char *, size_t,
+				loff_t *, struct cred *);
+typedef int	(*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, unsigned int, unsigned long);
+typedef int	(*vop_getattr_t)(bhv_desc_t *, struct vattr *, int,
+				struct cred *);
+typedef int	(*vop_setattr_t)(bhv_desc_t *, struct vattr *, int,
+				struct cred *);
+typedef int	(*vop_access_t)(bhv_desc_t *, int, struct cred *);
+typedef int	(*vop_lookup_t)(bhv_desc_t *, struct dentry *, vnode_t **,
+				int, vnode_t *, struct cred *);
+typedef int	(*vop_create_t)(bhv_desc_t *, struct dentry *, struct vattr *,
+				vnode_t **, struct cred *);
+typedef int	(*vop_remove_t)(bhv_desc_t *, struct dentry *, struct cred *);
+typedef int	(*vop_link_t)(bhv_desc_t *, vnode_t *, struct dentry *,
+				struct cred *);
+typedef int	(*vop_rename_t)(bhv_desc_t *, struct dentry *, vnode_t *,
+				struct dentry *, struct cred *);
+typedef int	(*vop_mkdir_t)(bhv_desc_t *, struct dentry *, struct vattr *,
+				vnode_t **, struct cred *);
+typedef	int	(*vop_rmdir_t)(bhv_desc_t *, struct dentry *, struct cred *);
+typedef int	(*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *,
+				int *);
+typedef int	(*vop_symlink_t)(bhv_desc_t *, struct dentry *,
+				struct vattr *, char *,
+				vnode_t **, struct cred *);
+typedef int	(*vop_readlink_t)(bhv_desc_t *, struct uio *, struct cred *);
+typedef int	(*vop_fsync_t)(bhv_desc_t *, int, struct cred *, xfs_off_t, xfs_off_t);
+typedef int	(*vop_inactive_t)(bhv_desc_t *, struct cred *);
+typedef int	(*vop_fid2_t)(bhv_desc_t *, struct fid *);
+typedef int	(*vop_release_t)(bhv_desc_t *);
+typedef int	(*vop_rwlock_t)(bhv_desc_t *, vrwlock_t);
+typedef void	(*vop_rwunlock_t)(bhv_desc_t *, vrwlock_t);
+typedef int	(*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, struct page_buf_bmap_s *, int *);
+typedef int	(*vop_strategy_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, struct cred *, struct page_buf_bmap_s *, int *);
+typedef int	(*vop_reclaim_t)(bhv_desc_t *);
+typedef int	(*vop_attr_get_t)(bhv_desc_t *, char *, char *, int *, int,
+				struct cred *);
+typedef int	(*vop_attr_set_t)(bhv_desc_t *, char *, char *, int, int,
+				struct cred *);
+typedef int	(*vop_attr_remove_t)(bhv_desc_t *, char *, int, struct cred *);
+typedef int	(*vop_attr_list_t)(bhv_desc_t *, char *, int, int,
+				struct attrlist_cursor_kern *, struct cred *);
+typedef void	(*vop_link_removed_t)(bhv_desc_t *, vnode_t *, int);
+typedef void	(*vop_vnode_change_t)(bhv_desc_t *, vchange_t, __psint_t);
+typedef void	(*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+typedef void	(*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int);
+typedef int	(*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int);
+typedef int	(*vop_iflush_t)(bhv_desc_t *, int);
+
+
+typedef struct vnodeops {
+	bhv_position_t	vn_position;	/* position within behavior chain */
+	vop_open_t		vop_open;
+	vop_read_t		vop_read;
+	vop_write_t		vop_write;
+	vop_ioctl_t		vop_ioctl;
+	vop_getattr_t		vop_getattr;
+	vop_setattr_t		vop_setattr;
+	vop_access_t		vop_access;
+	vop_lookup_t		vop_lookup;
+	vop_create_t		vop_create;
+	vop_remove_t		vop_remove;
+	vop_link_t		vop_link;
+	vop_rename_t		vop_rename;
+	vop_mkdir_t		vop_mkdir;
+	vop_rmdir_t		vop_rmdir;
+	vop_readdir_t		vop_readdir;
+	vop_symlink_t		vop_symlink;
+	vop_readlink_t		vop_readlink;
+	vop_fsync_t		vop_fsync;
+	vop_inactive_t		vop_inactive;
+	vop_fid2_t		vop_fid2;
+	vop_rwlock_t		vop_rwlock;
+	vop_rwunlock_t		vop_rwunlock;
+	vop_bmap_t		vop_bmap;
+	vop_strategy_t		vop_strategy;
+	vop_reclaim_t		vop_reclaim;
+	vop_attr_get_t		vop_attr_get;
+	vop_attr_set_t		vop_attr_set;
+	vop_attr_remove_t	vop_attr_remove;
+	vop_attr_list_t		vop_attr_list;
+	vop_link_removed_t	vop_link_removed;
+	vop_vnode_change_t	vop_vnode_change;
+	vop_ptossvp_t		vop_tosspages;
+	vop_pflushinvalvp_t	vop_flushinval_pages;
+	vop_pflushvp_t		vop_flush_pages;
+	vop_release_t		vop_release;
+	vop_iflush_t		vop_iflush;
+} vnodeops_t;
+
+/*
+ * VOP's.
+ */
+#define _VOP_(op, vp)	(*((vnodeops_t *)(vp)->v_fops)->op)
+
+/*
+ * Be careful with VOP_OPEN, since we're holding the chain lock on the
+ * original vnode and VOP_OPEN semantic allows the new vnode to be returned
+ * in vpp. The practice of passing &vp for vpp just doesn't work.
+ */
+#define VOP_READ(vp,file,buf,size,offset,cr,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,buf,size,offset,cr); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_WRITE(vp,file,buf,size,offset,cr,rv)			\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,buf,size,offset,cr);\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_BMAP(vp,of,sz,rw,cr,b,n,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,cr,b,n);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_STRATEGY(vp,of,sz,rw,cr,b,n,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_strategy, vp)((vp)->v_fbhv,of,sz,rw,cr,b,n);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_OPEN(vp, cr, rv)						\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_open, vp)((vp)->v_fbhv, cr);			\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_GETATTR(vp, vap, f, cr, rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_getattr, vp)((vp)->v_fbhv, vap, f, cr);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_SETATTR(vp, vap, f, cr, rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_setattr, vp)((vp)->v_fbhv, vap, f, cr);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_ACCESS(vp, mode, cr, rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_access, vp)((vp)->v_fbhv, mode, cr);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_LOOKUP(vp,d,vpp,f,rdir,cr,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_lookup, vp)((vp)->v_fbhv,d,vpp,f,rdir,cr);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_CREATE(dvp,d,vap,vpp,cr,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(dvp)->v_bh);					\
+	rv = _VOP_(vop_create, dvp)((dvp)->v_fbhv,d,vap,vpp,cr);	\
+	VN_BHV_READ_UNLOCK(&(dvp)->v_bh);				\
+}
+#define VOP_REMOVE(dvp,d,cr,rv)						\
+{									\
+	VN_BHV_READ_LOCK(&(dvp)->v_bh);					\
+	rv = _VOP_(vop_remove, dvp)((dvp)->v_fbhv,d,cr);		\
+	VN_BHV_READ_UNLOCK(&(dvp)->v_bh);				\
+}
+#define VOP_LINK(tdvp,fvp,d,cr,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(tdvp)->v_bh);				\
+	rv = _VOP_(vop_link, tdvp)((tdvp)->v_fbhv,fvp,d,cr);		\
+	VN_BHV_READ_UNLOCK(&(tdvp)->v_bh);				\
+}
+#define VOP_RENAME(fvp,fnm,tdvp,tnm,cr,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(fvp)->v_bh);					\
+	rv = _VOP_(vop_rename, fvp)((fvp)->v_fbhv,fnm,tdvp,tnm,cr);	\
+	VN_BHV_READ_UNLOCK(&(fvp)->v_bh);				\
+}
+#define VOP_MKDIR(dp,d,vap,vpp,cr,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(dp)->v_bh);					\
+	rv = _VOP_(vop_mkdir, dp)((dp)->v_fbhv,d,vap,vpp,cr);		\
+	VN_BHV_READ_UNLOCK(&(dp)->v_bh);				\
+}
+#define	VOP_RMDIR(dp,d,cr,rv)	 					\
+{									\
+	VN_BHV_READ_LOCK(&(dp)->v_bh);					\
+	rv = _VOP_(vop_rmdir, dp)((dp)->v_fbhv,d,cr);			\
+	VN_BHV_READ_UNLOCK(&(dp)->v_bh);				\
+}
+#define VOP_READDIR(vp,uiop,cr,eofp,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv)				\
+{									\
+	VN_BHV_READ_LOCK(&(dvp)->v_bh);					\
+	rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr); \
+	VN_BHV_READ_UNLOCK(&(dvp)->v_bh);				\
+}
+#define VOP_READLINK(vp,uiop,cr,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,cr);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_FSYNC(vp,f,cr,b,e,rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_INACTIVE(vp, cr, rv)					\
+{	/* vnode not reference-able, so no need to lock chain */	\
+	rv = _VOP_(vop_inactive, vp)((vp)->v_fbhv, cr);			\
+}
+#define VOP_RELEASE(vp, rv)						\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_release, vp)((vp)->v_fbhv);			\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_FID2(vp, fidp, rv)						\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_fid2, vp)((vp)->v_fbhv, fidp);			\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_RWLOCK(vp,i)						\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	(void)_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i);			\
+	/* "allow" is done by rwunlock */				\
+}
+#define VOP_RWLOCK_TRY(vp,i)						\
+	_VOP_(vop_rwlock, vp)((vp)->v_fbhv, i)
+
+#define VOP_RWUNLOCK(vp,i)						\
+{	/* "prevent" was done by rwlock */				\
+	(void)_VOP_(vop_rwunlock, vp)((vp)->v_fbhv, i);			\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_RECLAIM(vp, rv)						\
+{	/* vnode not reference-able, so no need to lock chain */	\
+	rv = _VOP_(vop_reclaim, vp)((vp)->v_fbhv);			\
+}
+#define VOP_ATTR_GET(vp, name, val, vallenp, fl, cred, rv)		\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_attr_get, vp)((vp)->v_fbhv,name,val,vallenp,fl,cred); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_ATTR_SET(vp, name, val, vallen, fl, cred, rv)		\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_attr_set, vp)((vp)->v_fbhv,name,val,vallen,fl,cred); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_ATTR_REMOVE(vp, name, flags, cred, rv)			\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_attr_remove, vp)((vp)->v_fbhv,name,flags,cred);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_ATTR_LIST(vp, buf, buflen, fl, cursor, cred, rv)		\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_attr_list, vp)((vp)->v_fbhv,buf,buflen,fl,cursor,cred);\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_LINK_REMOVED(vp, dvp, linkzero)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	(void)_VOP_(vop_link_removed, vp)((vp)->v_fbhv, dvp, linkzero); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_VNODE_CHANGE(vp, cmd, val)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	(void)_VOP_(vop_vnode_change, vp)((vp)->v_fbhv,cmd,val);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+/*
+ * These are page cache functions that now go thru VOPs.
+ * 'last' parameter is unused and left in for IRIX compatibility
+ */
+#define VOP_TOSS_PAGES(vp, first, last, fiopt)				\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	_VOP_(vop_tosspages, vp)((vp)->v_fbhv,first, last, fiopt);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+/*
+ * 'last' parameter is unused and left in for IRIX compatibility
+ */
+#define VOP_FLUSHINVAL_PAGES(vp, first, last, fiopt)			\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	_VOP_(vop_flushinval_pages, vp)((vp)->v_fbhv,first,last,fiopt); \
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+/*
+ * 'last' parameter is unused and left in for IRIX compatibility
+ */
+#define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv)		\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt);\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_IOCTL(vp, inode, filp, cmd, arg, rv)			\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,cmd,arg);	\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+#define VOP_IFLUSH(vp, flags, rv)					\
+{									\
+	VN_BHV_READ_LOCK(&(vp)->v_bh);					\
+	rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags);		\
+	VN_BHV_READ_UNLOCK(&(vp)->v_bh);				\
+}
+
+/*
+ * Flags for VOP_IFLUSH call
+ */
+
+#define FLUSH_SYNC		1	/* wait for flush to complete	*/
+#define FLUSH_INODE		2	/* flush the inode itself	*/
+#define FLUSH_LOG		4	/* force the last log entry for
+					 * this inode out to disk	*/
+
+/*
+ * Flush/Invalidate options for VOP_TOSS_PAGES, VOP_FLUSHINVAL_PAGES and
+ *	VOP_FLUSH_PAGES.
+ */
+#define FI_NONE			0	/* none */
+#define FI_REMAPF		1	/* Do a remapf prior to the operation */
+#define FI_REMAPF_LOCKED	2	/* Do a remapf prior to the operation.
+					   Prevent VM access to the pages until
+					   the operation completes. */
+
+/*
+ * Vnode attributes.  va_mask indicates those attributes the caller
+ * wants to set (setattr) or extract (getattr).
+ */
+typedef struct vattr {
+	int		va_mask;	/* bit-mask of attributes */
+	vtype_t		va_type;	/* vnode type (for create) */
+	mode_t		va_mode;	/* file access mode */
+	uid_t		va_uid;		/* owner user id */
+	gid_t		va_gid;		/* owner group id */
+	dev_t		va_fsid;	/* file system id (dev for now) */
+	xfs_ino_t	va_nodeid;	/* node id */
+	nlink_t		va_nlink;	/* number of references to file */
+	xfs_off_t	va_size;	/* file size in bytes */
+	timespec_t	va_atime;	/* time of last access */
+	timespec_t	va_mtime;	/* time of last modification */
+	timespec_t	va_ctime;	/* time file ``created'' */
+	dev_t		va_rdev;	/* device the file represents */
+	u_long		va_blksize;	/* fundamental block size */
+	__int64_t	va_nblocks;	/* # of blocks allocated */
+	u_long		va_vcode;	/* version code */
+	u_long		va_xflags;	/* random extended file flags */
+	u_long		va_extsize;	/* file extent size */
+	u_long		va_nextents;	/* number of extents in file */
+	u_long		va_anextents;	/* number of attr extents in file */
+	int		va_projid;	/* project id */
+	u_int		va_gencount;	/* object generation count */
+} vattr_t;
+
+/*
+ * setattr or getattr attributes
+ */
+#define AT_TYPE		0x00000001
+#define AT_MODE		0x00000002
+#define AT_UID		0x00000004
+#define AT_GID		0x00000008
+#define AT_FSID		0x00000010
+#define AT_NODEID	0x00000020
+#define AT_NLINK	0x00000040
+#define AT_SIZE		0x00000080
+#define AT_ATIME	0x00000100
+#define AT_MTIME	0x00000200
+#define AT_CTIME	0x00000400
+#define AT_RDEV		0x00000800
+#define AT_BLKSIZE	0x00001000
+#define AT_NBLOCKS	0x00002000
+#define AT_VCODE	0x00004000
+#define AT_MAC		0x00008000
+#define AT_UPDATIME	0x00010000
+#define AT_UPDMTIME	0x00020000
+#define AT_UPDCTIME	0x00040000
+#define AT_ACL		0x00080000
+#define AT_CAP		0x00100000
+#define AT_INF		0x00200000
+#define AT_XFLAGS	0x00400000
+#define AT_EXTSIZE	0x00800000
+#define AT_NEXTENTS	0x01000000
+#define AT_ANEXTENTS	0x02000000
+#define AT_PROJID	0x04000000
+#define AT_SIZE_NOPERM	0x08000000
+#define AT_GENCOUNT	0x10000000
+
+#define AT_ALL	(AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\
+		AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|\
+		AT_BLKSIZE|AT_NBLOCKS|AT_VCODE|AT_MAC|AT_ACL|AT_CAP|\
+		AT_INF|AT_XFLAGS|AT_EXTSIZE|AT_NEXTENTS|AT_ANEXTENTS|\
+		AT_PROJID|AT_GENCOUNT)
+
+#define AT_STAT (AT_TYPE|AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\
+		AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV|AT_BLKSIZE|\
+		AT_NBLOCKS|AT_PROJID)
+
+#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME)
+
+#define AT_UPDTIMES (AT_UPDATIME|AT_UPDMTIME|AT_UPDCTIME)
+
+#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|AT_TYPE|\
+		 AT_BLKSIZE|AT_NBLOCKS|AT_VCODE|AT_NEXTENTS|AT_ANEXTENTS|\
+		 AT_GENCOUNT)
+
+#define VREAD		00400
+#define VWRITE		00200
+#define VEXEC		00100
+#define VSGID		02000		/* set group id on execution */
+#define MODEMASK	07777		/* mode bits plus permission bits */
+
+/*
+ * Check whether mandatory file locking is enabled.
+ */
+#define MANDLOCK(vp, mode)	\
+	((vp)->v_type == VREG && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
+
+extern void	vn_init(void);
+extern int	vn_wait(struct vnode *);
+extern vnode_t	*vn_initialize(struct inode *);
+
+/*
+ * Acquiring and invalidating vnodes:
+ *
+ *	if (vn_get(vp, version, 0))
+ *		...;
+ *	vn_purge(vp, version);
+ *
+ * vn_get and vn_purge must be called with vmap_t arguments, sampled
+ * while a lock that the vnode's VOP_RECLAIM function acquires is
+ * held, to ensure that the vnode sampled with the lock held isn't
+ * recycled (VOP_RECLAIMed) or deallocated between the release of the lock
+ * and the subsequent vn_get or vn_purge.
+ */
+
+/*
+ * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
+ */
+typedef struct vnode_map {
+	vfs_t		*v_vfsp;
+	vnumber_t	v_number;		/* in-core vnode number */
+	xfs_ino_t	v_ino;			/* inode #	*/
+} vmap_t;
+
+#define VMAP(vp, ip, vmap)	{(vmap).v_vfsp	 = (vp)->v_vfsp,	\
+				 (vmap).v_number = (vp)->v_number,	\
+				 (vmap).v_ino	 = (ip)->i_ino; }
+extern void	vn_purge(struct vnode *, vmap_t *);
+extern vnode_t	*vn_get(struct vnode *, vmap_t *);
+extern int	vn_revalidate(struct vnode *, int);
+extern void	vn_remove(struct vnode *);
+
+static inline int vn_count(struct vnode *vp)
+{
+	return atomic_read(&LINVFS_GET_IP(vp)->i_count);
+}
+
+/*
+ * Vnode reference counting functions (and macros for compatibility).
+ */
+extern vnode_t	*vn_hold(struct vnode *);
+extern void	vn_rele(struct vnode *);
+
+#if defined(CONFIG_XFS_VNODE_TRACING)
+
+#define VN_HOLD(vp)		\
+	((void)vn_hold(vp), \
+	  vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address))
+#define VN_RELE(vp)		\
+	  (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \
+	   iput(LINVFS_GET_IP(vp)))
+
+#else	/* ! (defined(CONFIG_XFS_VNODE_TRACING)) */
+
+#define VN_HOLD(vp)		((void)vn_hold(vp))
+#define VN_RELE(vp)		(iput(LINVFS_GET_IP(vp)))
+
+#endif	/* ! (defined(CONFIG_XFS_VNODE_TRACING) */
+
+/*
+ * Vnode spinlock manipulation.
+ */
+#define VN_FLAGSET(vp,b)	vn_flagset(vp,b)
+#define VN_FLAGCLR(vp,b)	vn_flagclr(vp,b)
+
+static __inline__ void vn_flagset(struct vnode *vp, uint flag)
+{
+	spin_lock(&vp->v_lock);
+	vp->v_flag |= flag;
+	spin_unlock(&vp->v_lock);
+}
+
+static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
+{
+	spin_lock(&vp->v_lock);
+	vp->v_flag &= ~flag;
+	spin_unlock(&vp->v_lock);
+}
+
+/*
+ * Some useful predicates.
+ */
+#define VN_MAPPED(vp)	\
+	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
+	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
+#define VN_CACHED(vp)	(LINVFS_GET_IP(vp)->i_mapping->nrpages)
+#define VN_DIRTY(vp)	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages)))
+#define VMODIFY(vp)	{ VN_FLAGSET(vp, VMODIFIED); \
+			mark_inode_dirty(LINVFS_GET_IP(vp)); }
+#define VUNMODIFY(vp)	VN_FLAGCLR(vp, VMODIFIED)
+
+/*
+ * Flags to VOP_SETATTR/VOP_GETATTR.
+ */
+#define ATTR_UTIME	0x01	/* non-default utime(2) request */
+#define ATTR_EXEC	0x02	/* invocation from exec(2) */
+#define ATTR_COMM	0x04	/* yield common vp attributes */
+#define ATTR_DMI	0x08	/* invocation from a DMI function */
+#define ATTR_LAZY	0x80	/* set/get attributes lazily */
+#define ATTR_NONBLOCK	0x100	/* return EAGAIN if operation would block */
+#define ATTR_NOLOCK	0x200	/* Don't grab any conflicting locks */
+#define ATTR_NOSIZETOK	0x400	/* Don't get the DVN_SIZE_READ token */
+
+/*
+ * Flags to VOP_FSYNC and VOP_RECLAIM.
+ */
+#define FSYNC_NOWAIT	0	/* asynchronous flush */
+#define FSYNC_WAIT	0x1	/* synchronous fsync or forced reclaim */
+#define FSYNC_INVAL	0x2	/* flush and invalidate cached data */
+#define FSYNC_DATA	0x4	/* synchronous fsync of data only */
+
+#if (defined(CONFIG_XFS_VNODE_TRACING))
+
+#define VNODE_TRACE_SIZE	16		/* number of trace entries */
+
+/*
+ * Tracing entries.
+ */
+#define VNODE_KTRACE_ENTRY	1
+#define VNODE_KTRACE_EXIT	2
+#define VNODE_KTRACE_HOLD	3
+#define VNODE_KTRACE_REF	4
+#define VNODE_KTRACE_RELE	5
+
+extern void vn_trace_entry(struct vnode *, char *, inst_t *);
+extern void vn_trace_exit(struct vnode *, char *, inst_t *);
+extern void vn_trace_hold(struct vnode *, char *, int, inst_t *);
+extern void vn_trace_ref(struct vnode *, char *, int, inst_t *);
+extern void vn_trace_rele(struct vnode *, char *, int, inst_t *);
+#define VN_TRACE(vp)		\
+	vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address)
+
+#else	/* ! (defined(CONFIG_XFS_VNODE_TRACING)) */
+
+#define vn_trace_entry(a,b,c)
+#define vn_trace_exit(a,b,c)
+#define vn_trace_hold(a,b,c,d)
+#define vn_trace_ref(a,b,c,d)
+#define vn_trace_rele(a,b,c,d)
+#define VN_TRACE(vp)
+
+#endif	/* ! (defined(CONFIG_XFS_VNODE_TRACING)) */
+
+#endif	/* __XFS_VNODE_H__ */
diff --git a/fs/xfs/pagebuf/page_buf.c b/fs/xfs/pagebuf/page_buf.c
new file mode 100644
index 000000000000..b2cb9e20f7a4
--- /dev/null
+++ b/fs/xfs/pagebuf/page_buf.c
@@ -0,0 +1,2154 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ *	page_buf.c
+ *
+ *	The page_buf module provides an abstract buffer cache model on top of
+ *	the Linux page cache.  Cached blocks for a file are hashed to the
+ *	inode for that file, and can be held dirty in delayed write mode in
+ *	the page cache.	 Cached metadata blocks for a file system are hashed
+ *	to the inode for the mounted device.  The page_buf module assembles
+ *	buffer (page_buf_t) objects on demand to aggregate such cached pages
+ *	for I/O.
+ *
+ *
+ *	Written by Steve Lord, Jim Mostek, Russell Cattelan
+ *		    and Rajagopal Ananthanarayanan ("ananth") at SGI.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <asm/softirq.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+
+#include <support/kmem.h>
+#include "page_buf_internal.h"
+
+#define SECTOR_SHIFT	9
+#define SECTOR_SIZE	(1<<SECTOR_SHIFT)
+#define SECTOR_MASK	(SECTOR_SIZE - 1)
+#define BN_ALIGN_MASK	((1 << (PAGE_CACHE_SHIFT - SECTOR_SHIFT)) - 1)
+
+#ifndef GFP_READAHEAD
+#define GFP_READAHEAD	0
+#endif
+
+/*
+ * Debug code
+ */
+
+#ifdef PAGEBUF_TRACE
+static	spinlock_t		pb_trace_lock = SPIN_LOCK_UNLOCKED;
+struct pagebuf_trace_buf	pb_trace;
+EXPORT_SYMBOL(pb_trace);
+EXPORT_SYMBOL(pb_trace_func);
+#define CIRC_INC(i)	(((i) + 1) & (PB_TRACE_BUFSIZE - 1))
+
+void
+pb_trace_func(
+	page_buf_t	*pb,
+	int		event,
+	void		*misc,
+	void		*ra)
+{
+	int		j;
+	unsigned long	flags;
+
+	if (!pb_params.p_un.debug) return;
+
+	if (ra == NULL) ra = (void *)__builtin_return_address(0);
+
+	spin_lock_irqsave(&pb_trace_lock, flags);
+	j = pb_trace.start;
+	pb_trace.start = CIRC_INC(j);
+	spin_unlock_irqrestore(&pb_trace_lock, flags);
+
+	pb_trace.buf[j].pb = (unsigned long) pb;
+	pb_trace.buf[j].event = event;
+	pb_trace.buf[j].flags = pb->pb_flags;
+	pb_trace.buf[j].hold = pb->pb_hold.counter;
+	pb_trace.buf[j].lock_value = PBP(pb)->pb_sema.count.counter;
+	pb_trace.buf[j].task = (void *)current;
+	pb_trace.buf[j].misc = misc;
+	pb_trace.buf[j].ra = ra;
+	pb_trace.buf[j].offset = pb->pb_file_offset;
+	pb_trace.buf[j].size = pb->pb_buffer_length;
+}
+#endif	/* PAGEBUF_TRACE */
+
+#ifdef PAGEBUF_TRACKING
+#define MAX_PB	10000
+page_buf_t	*pb_array[MAX_PB];
+EXPORT_SYMBOL(pb_array);
+
+void
+pb_tracking_get(
+	page_buf_t	*pb)
+{
+	int		i;
+
+	for (i = 0; (pb_array[i] != 0) && (i < MAX_PB); i++) { }
+	if (i == MAX_PB)
+		printk("pb 0x%p not recorded in pb_array\n", pb);
+	else {
+		//printk("pb_get 0x%p in pb_array[%d]\n", pb, i);
+		pb_array[i] = pb;
+	}
+}
+
+void
+pb_tracking_free(
+	page_buf_t	*pb)
+{
+	int		i;
+
+	for (i = 0; (pb_array[i] != pb) && (i < MAX_PB); i++) { }
+	if (i < MAX_PB) {
+		//printk("pb_free 0x%p from pb_array[%d]\n", pb, i);
+		pb_array[i] = NULL;
+	}
+	else
+		printk("Freed unmonitored pagebuf 0x%p\n", pb);
+}
+#else
+#define pb_tracking_get(pb)	do { } while (0)
+#define pb_tracking_free(pb)	do { } while (0)
+#endif	/* PAGEBUF_TRACKING */
+
+/*
+ *	File wide globals
+ */
+
+STATIC kmem_cache_t *pagebuf_cache;
+STATIC pagebuf_daemon_t *pb_daemon;
+STATIC struct list_head pagebuf_iodone_tq[NR_CPUS];
+STATIC wait_queue_head_t pagebuf_iodone_wait[NR_CPUS];
+STATIC void pagebuf_daemon_wakeup(int);
+
+/*
+ * Pagebuf module configuration parameters, exported via
+ * /proc/sys/vm/pagebuf
+ */
+
+unsigned long pagebuf_min[P_PARAM] = {	HZ/2,	1*HZ, 0, 0 };
+unsigned long pagebuf_max[P_PARAM] = { HZ*30, HZ*300, 1, 1 };
+
+pagebuf_param_t pb_params = {{ HZ, 15 * HZ, 0, 0 }};
+
+/*
+ * Pagebuf statistics variables
+ */
+
+struct pbstats pbstats;
+
+/*
+ * Pagebuf allocation / freeing.
+ */
+
+#define pb_to_gfp(flags) \
+	(((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
+	 ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)
+
+#define pagebuf_allocate(flags) \
+	kmem_cache_alloc(pagebuf_cache, pb_to_gfp(flags))
+#define pagebuf_deallocate(pb) \
+	kmem_cache_free(pagebuf_cache, (pb));
+
+/*
+ * Pagebuf hashing
+ */
+
+#define NBITS	5
+#define NHASH	(1<<NBITS)
+
+typedef struct {
+	struct list_head	pb_hash;
+	int			pb_count;
+	spinlock_t		pb_hash_lock;
+} pb_hash_t;
+
+STATIC pb_hash_t	pbhash[NHASH];
+#define pb_hash(pb)	&pbhash[pb->pb_hash_index]
+
+STATIC int
+_bhash(
+	dev_t		dev,
+	loff_t		base)
+{
+	int		bit, hval;
+
+	base >>= 9;
+	/*
+	 * dev_t is 16 bits, loff_t is always 64 bits
+	 */
+	base ^= dev;
+	for (bit = hval = 0; base != 0 && bit < sizeof(base) * 8; bit += NBITS) {
+		hval ^= (int)base & (NHASH-1);
+		base >>= NBITS;
+	}
+	return hval;
+}
+
+/*
+ * Mapping of multi-page buffers into contingous virtual space
+ */
+
+STATIC void *pagebuf_mapout_locked(page_buf_t *);
+
+STATIC	spinlock_t		as_lock = SPIN_LOCK_UNLOCKED;
+typedef struct a_list {
+	void	*vm_addr;
+	struct a_list	*next;
+} a_list_t;
+STATIC	a_list_t	*as_free_head;
+STATIC	int		as_list_len;
+
+STATIC void
+free_address(
+	void		*addr)
+{
+	a_list_t	*aentry;
+
+	spin_lock(&as_lock);
+	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
+	aentry->next = as_free_head;
+	aentry->vm_addr = addr;
+	as_free_head = aentry;
+	as_list_len++;
+	spin_unlock(&as_lock);
+}
+
+STATIC void
+purge_addresses(void)
+{
+	a_list_t	*aentry, *old;
+
+	if (as_free_head == NULL) return;
+
+	spin_lock(&as_lock);
+	aentry = as_free_head;
+	as_free_head = NULL;
+	as_list_len = 0;
+	spin_unlock(&as_lock);
+
+	while ((old = aentry) != NULL) {
+		vunmap(aentry->vm_addr);
+		aentry = aentry->next;
+		kfree(old);
+	}
+}
+
+/*
+ *	Locking model:
+ *
+ *	Buffers associated with inodes for which buffer locking
+ *	is not enabled are not protected by semaphores, and are
+ *	assumed to be exclusively owned by the caller.	There is
+ *	spinlock in the buffer, for use by the caller when concurrent
+ *	access is possible.
+ */
+
+/*
+ *	Internal pagebuf object manipulation
+ */
+
+STATIC void
+_pagebuf_initialize(
+	page_buf_t		*pb,
+	pb_target_t		*target,
+	loff_t			range_base,
+	size_t			range_length,
+	page_buf_flags_t	flags)
+{
+	/*
+	 * We don't want certain flags to appear in pb->pb_flags.
+	 */
+	flags &= ~(PBF_LOCK|PBF_ENTER_PAGES|PBF_MAPPED);
+	flags &= ~(PBF_DONT_BLOCK|PBF_READ_AHEAD);
+
+	pb_tracking_get(pb);
+
+	memset(pb, 0, sizeof(page_buf_private_t));
+	atomic_set(&pb->pb_hold, 1);
+	init_MUTEX_LOCKED(&pb->pb_iodonesema);
+	INIT_LIST_HEAD(&pb->pb_list);
+	INIT_LIST_HEAD(&pb->pb_hash_list);
+	init_MUTEX_LOCKED(&PBP(pb)->pb_sema); /* held, no waiters */
+	PB_SET_OWNER(pb);
+	pb->pb_target = target;
+	pb->pb_file_offset = range_base;
+	/*
+	 * Set buffer_length and count_desired to the same value initially.
+	 * IO routines should use count_desired, which will be the same in
+	 * most cases but may be reset (e.g. XFS recovery).
+	 */
+	pb->pb_buffer_length = pb->pb_count_desired = range_length;
+	pb->pb_flags = flags | PBF_NONE;
+	pb->pb_bn = PAGE_BUF_DADDR_NULL;
+	atomic_set(&PBP(pb)->pb_pin_count, 0);
+	init_waitqueue_head(&PBP(pb)->pb_waiters);
+
+	PB_STATS_INC(pbstats.pb_create);
+	PB_TRACE(pb, PB_TRACE_REC(get), target);
+}
+
+/*
+ * Allocate a page array capable of holding a specified number
+ * of pages, and point the page buf at it.
+ */
+STATIC int
+_pagebuf_get_pages(
+	page_buf_t		*pb,
+	int			page_count,
+	int			flags)
+{
+	int			gpf_mask = pb_to_gfp(flags);
+
+	/* Make sure that we have a page list */
+	if (pb->pb_pages == NULL) {
+		pb->pb_offset = page_buf_poff(pb->pb_file_offset);
+		pb->pb_page_count = page_count;
+		if (page_count <= PB_PAGES) {
+			pb->pb_pages = pb->pb_page_array;
+		} else {
+			pb->pb_pages = kmalloc(sizeof(struct page *) *
+					page_count, gpf_mask);
+			if (pb->pb_pages == NULL)
+				return -ENOMEM;
+		}
+		memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
+	}
+	return 0;
+}
+
+/*
+ * Walk a pagebuf releasing all the pages contained within it.
+ */
+STATIC inline void
+_pagebuf_freepages(
+	page_buf_t		*pb)
+{
+	int			buf_index;
+
+	for (buf_index = 0; buf_index < pb->pb_page_count; buf_index++) {
+		struct page	*page = pb->pb_pages[buf_index];
+
+		if (page) {
+			pb->pb_pages[buf_index] = NULL;
+			page_cache_release(page);
+		}
+	}
+
+	if (pb->pb_pages != pb->pb_page_array)
+		kfree(pb->pb_pages);
+}
+
+/*
+ *	_pagebuf_free_object
+ *
+ *	_pagebuf_free_object releases the contents specified buffer.
+ *	The modification state of any associated pages is left unchanged.
+ */
+void
+_pagebuf_free_object(
+	pb_hash_t	*hash,	/* hash bucket for buffer	*/
+	page_buf_t	*pb)	/* buffer to deallocate		*/
+{
+	int		pb_flags = pb->pb_flags;
+
+	PB_TRACE(pb, PB_TRACE_REC(free_obj), 0);
+	pb->pb_flags |= PBF_FREED;
+
+	if (hash) {
+		if (!list_empty(&pb->pb_hash_list)) {
+			hash->pb_count--;
+			list_del_init(&pb->pb_hash_list);
+		}
+		spin_unlock(&hash->pb_hash_lock);
+	}
+
+	if (!(pb_flags & PBF_FREED)) {
+		/* release any virtual mapping */ ;
+		if (pb->pb_flags & _PBF_ADDR_ALLOCATED) {
+			void *vaddr = pagebuf_mapout_locked(pb);
+			if (vaddr) {
+				free_address(vaddr);
+			}
+		}
+
+		if (pb->pb_flags & _PBF_MEM_ALLOCATED) {
+			if (pb->pb_pages) {
+				/* release the pages in the address list */
+				if (pb->pb_pages[0] &&
+				    PageSlab(pb->pb_pages[0])) {
+					/*
+					 * This came from the slab
+					 * allocator free it as such
+					 */
+					kfree(pb->pb_addr);
+				} else {
+					_pagebuf_freepages(pb);
+				}
+
+				pb->pb_pages = NULL;
+			}
+			pb->pb_flags &= ~_PBF_MEM_ALLOCATED;
+		}
+	}
+
+	pb_tracking_free(pb);
+	pagebuf_deallocate(pb);
+}
+
+/*
+ *	_pagebuf_lookup_pages
+ *
+ *	_pagebuf_lookup_pages finds all pages which match the buffer
+ *	in question and the range of file offsets supplied,
+ *	and builds the page list for the buffer, if the
+ *	page list is not already formed or if not all of the pages are
+ *	already in the list. Invalid pages (pages which have not yet been
+ *	read in from disk) are assigned for any pages which are not found.
+ */
+STATIC int
+_pagebuf_lookup_pages(
+	page_buf_t		*pb,
+	struct address_space	*aspace,
+	page_buf_flags_t	flags)
+{
+	loff_t			next_buffer_offset;
+	unsigned long		page_count, pi, index;
+	struct page		*page;
+	int			gfp_mask, retry_count = 5, rval = 0;
+	int			all_mapped, good_pages, nbytes;
+	size_t			blocksize, size, offset;
+
+
+	/* For pagebufs where we want to map an address, do not use
+	 * highmem pages - so that we do not need to use kmap resources
+	 * to access the data.
+	 *
+	 * For pages where the caller has indicated there may be resource
+	 * contention (e.g. called from a transaction) do not flush
+	 * delalloc pages to obtain memory.
+	 */
+
+	if (flags & PBF_READ_AHEAD) {
+		gfp_mask = GFP_READAHEAD;
+		retry_count = 0;
+	} else if (flags & PBF_DONT_BLOCK) {
+		gfp_mask = GFP_NOFS;
+	} else if (flags & PBF_MAPPABLE) {
+		gfp_mask = GFP_KERNEL;
+	} else {
+		gfp_mask = GFP_HIGHUSER;
+	}
+
+	next_buffer_offset = pb->pb_file_offset + pb->pb_buffer_length;
+
+	good_pages = page_count = (page_buf_btoc(next_buffer_offset) -
+				   page_buf_btoct(pb->pb_file_offset));
+
+	if (pb->pb_flags & _PBF_ALL_PAGES_MAPPED) {
+		/* Bring pages forward in cache */
+		for (pi = 0; pi < page_count; pi++) {
+			mark_page_accessed(pb->pb_pages[pi]);
+		}
+		if ((flags & PBF_MAPPED) && !(pb->pb_flags & PBF_MAPPED)) {
+			all_mapped = 1;
+			goto mapit;
+		}
+		return 0;
+	}
+
+	/* Ensure pb_pages field has been initialised */
+	rval = _pagebuf_get_pages(pb, page_count, flags);
+	if (rval)
+		return rval;
+
+	rval = pi = 0;
+	blocksize = pb->pb_target->pbr_blocksize;
+	size = pb->pb_count_desired;
+	offset = pb->pb_offset;
+
+	/* Enter the pages in the page list */
+	index = (pb->pb_file_offset - pb->pb_offset) >> PAGE_CACHE_SHIFT;
+	for (all_mapped = 1; pi < page_count; pi++, index++) {
+		if (pb->pb_pages[pi] == 0) {
+		      retry:
+			page = find_or_create_page(aspace, index, gfp_mask);
+			if (!page) {
+				if (--retry_count > 0) {
+					PB_STATS_INC(pbstats.pb_page_retries);
+					pagebuf_daemon_wakeup(1);
+					current->state = TASK_UNINTERRUPTIBLE;
+					schedule_timeout(10);
+					goto retry;
+				}
+				rval = -ENOMEM;
+				all_mapped = 0;
+				continue;
+			}
+			PB_STATS_INC(pbstats.pb_page_found);
+			mark_page_accessed(page);
+			pb->pb_pages[pi] = page;
+		} else {
+			page = pb->pb_pages[pi];
+			lock_page(page);
+		}
+
+		nbytes = PAGE_CACHE_SIZE - offset;
+		if (nbytes > size)
+			nbytes = size;
+		size -= nbytes;
+
+		if (!PageUptodate(page)) {
+			if ((blocksize == PAGE_CACHE_SIZE) &&
+			    (flags & PBF_READ)) {
+				pb->pb_locked = 1;
+				good_pages--;
+			} else if (!PagePrivate(page)) {
+				unsigned long i, range = (offset + nbytes) >> SECTOR_SHIFT;
+
+				assert(blocksize < PAGE_CACHE_SIZE);
+				assert(!(pb->pb_flags & _PBF_PRIVATE_BH));
+				/*
+				 * In this case page->private holds a bitmap
+				 * of uptodate sectors (512) within the page
+				 */
+				for (i = offset >> SECTOR_SHIFT; i < range; i++)
+					if (!test_bit(i, &page->private))
+						break;
+				if (i != range)
+					good_pages--;
+			} else {
+				good_pages--;
+			}
+		}
+		offset = 0;
+	}
+
+	if (!pb->pb_locked) {
+		for (pi = 0; pi < page_count; pi++) {
+			unlock_page(pb->pb_pages[pi]);
+		}
+	}
+
+mapit:
+	pb->pb_flags |= _PBF_MEM_ALLOCATED;
+	if (all_mapped) {
+		pb->pb_flags |= _PBF_ALL_PAGES_MAPPED;
+
+		/* A single page buffer is always mappable */
+		if (page_count == 1) {
+			pb->pb_addr = (caddr_t)
+			    	page_address(pb->pb_pages[0]) + pb->pb_offset;
+			pb->pb_flags |= PBF_MAPPED;
+		} else if (flags & PBF_MAPPED) {
+			if (as_list_len > 64)
+				purge_addresses();
+			pb->pb_addr = vmap(pb->pb_pages, page_count);
+			if (!pb->pb_addr)
+				BUG();
+			pb->pb_addr += pb->pb_offset;
+			pb->pb_flags |= PBF_MAPPED | _PBF_ADDR_ALLOCATED;
+		}
+	}
+	/* If some pages were found with data in them
+	 * we are not in PBF_NONE state.
+	 */
+	if (good_pages != 0) {
+		pb->pb_flags &= ~(PBF_NONE);
+		if (good_pages != page_count) {
+			pb->pb_flags |= PBF_PARTIAL;
+		}
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(look_pg), good_pages);
+
+	return rval;
+}
+
+
+/*
+ *	Finding and Reading Buffers
+ */
+
+/*
+ *	_pagebuf_find
+ *
+ *	Looks up, and creates if absent, a lockable buffer for
+ *	a given range of an inode.  The buffer is returned
+ *	locked.	 If other overlapping buffers exist, they are
+ *	released before the new buffer is created and locked,
+ *	which may imply that this call will block until those buffers
+ *	are unlocked.  No I/O is implied by this call.
+ */
+STATIC page_buf_t *
+_pagebuf_find(				/* find buffer for block	*/
+	pb_target_t		*target,/* target for block		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags,	/* PBF_TRYLOCK			*/
+	page_buf_t		*new_pb)/* newly allocated buffer	*/
+{
+	loff_t			range_base;
+	size_t			range_length;
+	int			hval;
+	pb_hash_t		*h;
+	struct list_head	*p;
+	page_buf_t		*pb;
+	int			not_locked;
+
+	range_base = (ioff << SECTOR_SHIFT);
+	range_length = (isize << SECTOR_SHIFT);
+
+	hval = _bhash(target->pbr_bdev->bd_dev, range_base);
+	h = &pbhash[hval];
+
+	spin_lock(&h->pb_hash_lock);
+	list_for_each(p, &h->pb_hash) {
+		pb = list_entry(p, page_buf_t, pb_hash_list);
+
+		if ((target == pb->pb_target) &&
+		    (pb->pb_file_offset == range_base) &&
+		    (pb->pb_buffer_length == range_length)) {
+			if (pb->pb_flags & PBF_FREED)
+				break;
+			/* If we look at something bring it to the
+			 * front of the list for next time
+			 */
+			list_del(&pb->pb_hash_list);
+			list_add(&pb->pb_hash_list, &h->pb_hash);
+			goto found;
+		}
+	}
+
+	/* No match found */
+	if (new_pb) {
+		_pagebuf_initialize(new_pb, target, range_base,
+				range_length, flags | _PBF_LOCKABLE);
+		new_pb->pb_hash_index = hval;
+		h->pb_count++;
+		list_add(&new_pb->pb_hash_list, &h->pb_hash);
+	} else {
+		PB_STATS_INC(pbstats.pb_miss_locked);
+	}
+
+	spin_unlock(&h->pb_hash_lock);
+	return (new_pb);
+
+found:
+	atomic_inc(&pb->pb_hold);
+	spin_unlock(&h->pb_hash_lock);
+
+	/* Attempt to get the semaphore without sleeping,
+	 * if this does not work then we need to drop the
+	 * spinlock and do a hard attempt on the semaphore.
+	 */
+	not_locked = down_trylock(&PBP(pb)->pb_sema);
+	if (not_locked) {
+		if (!(flags & PBF_TRYLOCK)) {
+			/* wait for buffer ownership */
+			PB_TRACE(pb, PB_TRACE_REC(get_lk), 0);
+			pagebuf_lock(pb);
+			PB_STATS_INC(pbstats.pb_get_locked_waited);
+		} else {
+			/* We asked for a trylock and failed, no need
+			 * to look at file offset and length here, we
+			 * know that this pagebuf at least overlaps our
+			 * pagebuf and is locked, therefore our buffer
+			 * either does not exist, or is this buffer
+			 */
+
+			pagebuf_rele(pb);
+			PB_STATS_INC(pbstats.pb_busy_locked);
+			return (NULL);
+		}
+	} else {
+		/* trylock worked */
+		PB_SET_OWNER(pb);
+	}
+
+	if (pb->pb_flags & PBF_STALE)
+		pb->pb_flags &= PBF_MAPPABLE | \
+				PBF_MAPPED | \
+				_PBF_LOCKABLE | \
+				_PBF_ALL_PAGES_MAPPED | \
+				_PBF_SOME_INVALID_PAGES | \
+				_PBF_ADDR_ALLOCATED | \
+				_PBF_MEM_ALLOCATED;
+	PB_TRACE(pb, PB_TRACE_REC(got_lk), 0);
+	PB_STATS_INC(pbstats.pb_get_locked);
+	return (pb);
+}
+
+
+/*
+ *	pagebuf_find
+ *
+ *	pagebuf_find returns a buffer matching the specified range of
+ *	data for the specified target, if any of the relevant blocks
+ *	are in memory.	The buffer may have unallocated holes, if
+ *	some, but not all, of the blocks are in memory.	 Even where
+ *	pages are present in the buffer, not all of every page may be
+ *	valid.	The file system may use pagebuf_segment to visit the
+ *	various segments of the buffer.
+ */
+page_buf_t *
+pagebuf_find(				/* find buffer for block	*/
+					/* if the block is in memory	*/
+	pb_target_t		*target,/* target for block		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+	return _pagebuf_find(target, ioff, isize, flags, NULL);
+}
+
+/*
+ *	pagebuf_get
+ *
+ *	pagebuf_get assembles a buffer covering the specified range.
+ *	Some or all of the blocks in the range may be valid.  The file
+ *	system may use pagebuf_segment to visit the various segments
+ *	of the buffer.	Storage in memory for all portions of the
+ *	buffer will be allocated, although backing storage may not be.
+ *	If PBF_READ is set in flags, pagebuf_read
+ */
+page_buf_t *
+pagebuf_get(				/* allocate a buffer		*/
+	pb_target_t		*target,/* target for buffer 		*/
+	loff_t			ioff,	/* starting offset of range	*/
+	size_t			isize,	/* length of range		*/
+	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
+{
+	page_buf_t		*pb, *new_pb;
+	int			error;
+
+	new_pb = pagebuf_allocate(flags);
+	if (unlikely(!new_pb))
+		return (NULL);
+
+	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
+	if (pb != new_pb) {
+		pagebuf_deallocate(new_pb);
+		if (unlikely(!pb))
+			return (NULL);
+	}
+
+	PB_STATS_INC(pbstats.pb_get);
+
+	/* fill in any missing pages */
+	error = _pagebuf_lookup_pages(pb, pb->pb_target->pbr_mapping, flags);
+	if (unlikely(error)) {
+		pagebuf_free(pb);
+		return (NULL);
+	}
+
+	/*
+	 * Always fill in the block number now, the mapped cases can do
+	 * their own overlay of this later.
+	 */
+	pb->pb_bn = ioff;
+	pb->pb_count_desired = pb->pb_buffer_length;
+
+	if (flags & PBF_READ) {
+		if (PBF_NOT_DONE(pb)) {
+			PB_TRACE(pb, PB_TRACE_REC(get_read), flags);
+			PB_STATS_INC(pbstats.pb_get_read);
+			pagebuf_iostart(pb, flags);
+		} else if (flags & PBF_ASYNC) {
+			/*
+			 * Read ahead call which is already satisfied,
+			 * drop the buffer
+			 */
+			if (flags & (PBF_LOCK | PBF_TRYLOCK))
+				pagebuf_unlock(pb);
+			pagebuf_rele(pb);
+			return NULL;
+		} else {
+			/* We do not want read in the flags */
+			pb->pb_flags &= ~PBF_READ;
+		}
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(get_obj), flags);
+	return (pb);
+}
+
+/*
+ * Create a pagebuf and populate it with pages from the address
+ * space of the passed in inode.
+ */
+page_buf_t *
+pagebuf_lookup(
+	struct pb_target	*target,
+	struct inode		*inode,
+	loff_t			ioff,
+	size_t			isize,
+	int			flags)
+{
+	page_buf_t		*pb = NULL;
+	int			status;
+
+	flags |= _PBF_PRIVATE_BH;
+	pb = pagebuf_allocate(flags);
+	if (pb) {
+		_pagebuf_initialize(pb, target, ioff, isize, flags);
+		if (flags & PBF_ENTER_PAGES) {
+			status = _pagebuf_lookup_pages(pb, &inode->i_data, 0);
+			if (status != 0) {
+				pagebuf_free(pb);
+				return (NULL);
+			}
+		}
+	}
+	return pb;
+}
+
+/*
+ * If we are not low on memory then do the readahead in a deadlock
+ * safe manner.
+ */
+void
+pagebuf_readahead(
+	pb_target_t		*target,
+	loff_t			ioff,
+	size_t			isize,
+	int			flags)
+{
+	flags |= (PBF_TRYLOCK|PBF_READ|PBF_ASYNC|PBF_MAPPABLE|PBF_READ_AHEAD);
+	pagebuf_get(target, ioff, isize, flags);
+}
+
+page_buf_t *
+pagebuf_get_empty(
+	pb_target_t		*target)
+{
+	page_buf_t		*pb;
+
+	pb = pagebuf_allocate(_PBF_LOCKABLE);
+	if (pb)
+		_pagebuf_initialize(pb, target, 0, 0, _PBF_LOCKABLE);
+	return pb;
+}
+
+static inline struct page *
+mem_to_page(
+	void			*addr)
+{
+	if (((unsigned long)addr < VMALLOC_START) ||
+            ((unsigned long)addr >= VMALLOC_END)) {
+		return virt_to_page(addr);
+	} else {
+		return vmalloc_to_page(addr);
+	}
+}
+
+int
+pagebuf_associate_memory(
+	page_buf_t		*pb,
+	void			*mem,
+	size_t			len)
+{
+	int			rval;
+	int			i = 0;
+	size_t			ptr;
+	size_t			end, end_cur;
+	off_t			offset;
+	int			page_count;
+
+	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
+	if (offset && (len > PAGE_CACHE_SIZE))
+		page_count++;
+
+	/* Free any previous set of page pointers */
+	if (pb->pb_pages && (pb->pb_pages != pb->pb_page_array)) {
+		kfree(pb->pb_pages);
+	}
+	pb->pb_pages = NULL;
+	pb->pb_addr = mem;
+
+	rval = _pagebuf_get_pages(pb, page_count, 0);
+	if (rval)
+		return rval;
+
+	pb->pb_offset = offset;
+	ptr = (size_t) mem & PAGE_CACHE_MASK;
+	end = PAGE_CACHE_ALIGN((size_t) mem + len);
+	end_cur = end;
+	/* set up first page */
+	pb->pb_pages[0] = mem_to_page(mem);
+
+	ptr += PAGE_CACHE_SIZE;
+	pb->pb_page_count = ++i;
+	while (ptr < end) {
+		pb->pb_pages[i] = mem_to_page((void *)ptr);
+		pb->pb_page_count = ++i;
+		ptr += PAGE_CACHE_SIZE;
+	}
+	pb->pb_locked = 0;
+
+	pb->pb_count_desired = pb->pb_buffer_length = len;
+	pb->pb_flags |= PBF_MAPPED | _PBF_PRIVATE_BH;
+
+	return 0;
+}
+
+page_buf_t *
+pagebuf_get_no_daddr(
+	size_t			len,
+	pb_target_t		*target)
+{
+	int			rval;
+	void			*rmem = NULL;
+	int			flags = _PBF_LOCKABLE | PBF_FORCEIO;
+	page_buf_t		*pb;
+	size_t			tlen = 0;
+
+	if (len > 0x20000)
+		return(NULL);
+
+	pb = pagebuf_allocate(flags);
+	if (!pb)
+		return NULL;
+
+	_pagebuf_initialize(pb, target, 0, len, flags);
+
+	do {
+		if (tlen == 0) {
+			tlen = len; /* first time */
+		} else {
+			kfree(rmem); /* free the mem from the previous try */
+			tlen <<= 1; /* double the size and try again */
+			/*
+			printk(
+			"pb_get_no_daddr NOT block 0x%p mask 0x%p len %d\n",
+				rmem, ((size_t)rmem & (size_t)~SECTOR_MASK),
+				len);
+			*/
+		}
+		if ((rmem = kmalloc(tlen, GFP_KERNEL)) == 0) {
+			pagebuf_free(pb);
+			return NULL;
+		}
+	} while ((size_t)rmem != ((size_t)rmem & (size_t)~SECTOR_MASK));
+
+	if ((rval = pagebuf_associate_memory(pb, rmem, len)) != 0) {
+		kfree(rmem);
+		pagebuf_free(pb);
+		return NULL;
+	}
+	/* otherwise pagebuf_free just ignores it */
+	pb->pb_flags |= _PBF_MEM_ALLOCATED;
+	up(&PBP(pb)->pb_sema);	/* Return unlocked pagebuf */
+
+	PB_TRACE(pb, PB_TRACE_REC(no_daddr), rmem);
+
+	return pb;
+}
+
+
+/*
+ *	pagebuf_hold
+ *
+ *	Increment reference count on buffer, to hold the buffer concurrently
+ *	with another thread which may release (free) the buffer asynchronously.
+ *
+ *	Must hold the buffer already to call this function.
+ */
+void
+pagebuf_hold(
+	page_buf_t		*pb)
+{
+	atomic_inc(&pb->pb_hold);
+	PB_TRACE(pb, PB_TRACE_REC(hold), 0);
+}
+
+/*
+ *	pagebuf_free
+ *
+ *	pagebuf_free releases the specified buffer.  The modification
+ *	state of any associated pages is left unchanged.
+ */
+void
+pagebuf_free(
+	page_buf_t		*pb)
+{
+	if (pb->pb_flags & _PBF_LOCKABLE) {
+		pb_hash_t	*h = pb_hash(pb);
+
+		spin_lock(&h->pb_hash_lock);
+		_pagebuf_free_object(h, pb);
+	} else {
+		_pagebuf_free_object(NULL, pb);
+	}
+}
+
+/*
+ *	pagebuf_rele
+ *
+ *	pagebuf_rele releases a hold on the specified buffer.  If the
+ *	the hold count is 1, pagebuf_rele calls pagebuf_free.
+ */
+void
+pagebuf_rele(
+	page_buf_t		*pb)
+{
+	pb_hash_t		*h;
+
+	PB_TRACE(pb, PB_TRACE_REC(rele), pb->pb_relse);
+	if (pb->pb_flags & _PBF_LOCKABLE) {
+		h = pb_hash(pb);
+		spin_lock(&h->pb_hash_lock);
+	} else {
+		h = NULL;
+	}
+
+	if (atomic_dec_and_test(&pb->pb_hold)) {
+		int		do_free = 1;
+
+		if (pb->pb_relse) {
+			atomic_inc(&pb->pb_hold);
+			if (h)
+				spin_unlock(&h->pb_hash_lock);
+			(*(pb->pb_relse)) (pb);
+			do_free = 0;
+		}
+		if (pb->pb_flags & PBF_DELWRI) {
+			pb->pb_flags |= PBF_ASYNC;
+			atomic_inc(&pb->pb_hold);
+			if (h && do_free)
+				spin_unlock(&h->pb_hash_lock);
+			pagebuf_delwri_queue(pb, 0);
+			do_free = 0;
+		} else if (pb->pb_flags & PBF_FS_MANAGED) {
+			if (h)
+				spin_unlock(&h->pb_hash_lock);
+			do_free = 0;
+		}
+
+		if (do_free) {
+			_pagebuf_free_object(h, pb);
+		}
+	} else if (h) {
+		spin_unlock(&h->pb_hash_lock);
+	}
+}
+
+
+/*
+ *	Pinning Buffer Storage in Memory
+ */
+
+/*
+ *	pagebuf_pin
+ *
+ *	pagebuf_pin locks all of the memory represented by a buffer in
+ *	memory.	 Multiple calls to pagebuf_pin and pagebuf_unpin, for
+ *	the same or different buffers affecting a given page, will
+ *	properly count the number of outstanding "pin" requests.  The
+ *	buffer may be released after the pagebuf_pin and a different
+ *	buffer used when calling pagebuf_unpin, if desired.
+ *	pagebuf_pin should be used by the file system when it wants be
+ *	assured that no attempt will be made to force the affected
+ *	memory to disk.	 It does not assure that a given logical page
+ *	will not be moved to a different physical page.
+ */
+void
+pagebuf_pin(
+	page_buf_t		*pb)
+{
+	atomic_inc(&PBP(pb)->pb_pin_count);
+	PB_TRACE(pb, PB_TRACE_REC(pin), PBP(pb)->pb_pin_count.counter);
+}
+
+/*
+ *	pagebuf_unpin
+ *
+ *	pagebuf_unpin reverses the locking of memory performed by
+ *	pagebuf_pin.  Note that both functions affected the logical
+ *	pages associated with the buffer, not the buffer itself.
+ */
+void
+pagebuf_unpin(
+	page_buf_t		*pb)
+{
+	if (atomic_dec_and_test(&PBP(pb)->pb_pin_count)) {
+		wake_up_all(&PBP(pb)->pb_waiters);
+	}
+	PB_TRACE(pb, PB_TRACE_REC(unpin), PBP(pb)->pb_pin_count.counter);
+}
+
+int
+pagebuf_ispin(
+	page_buf_t		*pb)
+{
+	return atomic_read(&PBP(pb)->pb_pin_count);
+}
+
+/*
+ *	pagebuf_wait_unpin
+ *
+ *	pagebuf_wait_unpin waits until all of the memory associated
+ *	with the buffer is not longer locked in memory.	 It returns
+ *	immediately if none of the affected pages are locked.
+ */
+static inline void
+_pagebuf_wait_unpin(
+	page_buf_t		*pb)
+{
+	DECLARE_WAITQUEUE	(wait, current);
+
+	if (atomic_read(&PBP(pb)->pb_pin_count) == 0)
+		return;
+
+	add_wait_queue(&PBP(pb)->pb_waiters, &wait);
+	for (;;) {
+		current->state = TASK_UNINTERRUPTIBLE;
+		if (atomic_read(&PBP(pb)->pb_pin_count) == 0) {
+			break;
+		}
+		blk_run_queues();
+		schedule();
+	}
+	remove_wait_queue(&PBP(pb)->pb_waiters, &wait);
+	current->state = TASK_RUNNING;
+}
+
+void
+pagebuf_queue_task(
+	struct tq_struct	*task)
+{
+	queue_task(task, &pagebuf_iodone_tq[smp_processor_id()]);
+	wake_up(&pagebuf_iodone_wait[smp_processor_id()]);
+}
+
+
+/*
+ *	Buffer Utility Routines
+ */
+
+/*
+ *	pagebuf_iodone
+ *
+ *	pagebuf_iodone marks a buffer for which I/O is in progress
+ *	done with respect to that I/O.	The pb_done routine, if
+ *	present, will be called as a side-effect.
+ */
+void
+pagebuf_iodone_sched(
+	void			*v)
+{
+	page_buf_t		*pb = (page_buf_t *)v;
+
+	if (pb->pb_iodone) {
+		(*(pb->pb_iodone)) (pb);
+		return;
+	}
+
+	if (pb->pb_flags & PBF_ASYNC) {
+		if ((pb->pb_flags & _PBF_LOCKABLE) && !pb->pb_relse)
+			pagebuf_unlock(pb);
+		pagebuf_rele(pb);
+	}
+}
+
+void
+pagebuf_iodone(
+	page_buf_t		*pb)
+{
+	pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
+	if (pb->pb_error == 0) {
+		pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(done), pb->pb_iodone);
+
+	if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
+		INIT_TQUEUE(&pb->pb_iodone_sched,
+			pagebuf_iodone_sched, (void *)pb);
+
+		queue_task(&pb->pb_iodone_sched,
+				&pagebuf_iodone_tq[smp_processor_id()]);
+		wake_up(&pagebuf_iodone_wait[smp_processor_id()]);
+	} else {
+		up(&pb->pb_iodonesema);
+	}
+}
+
+/*
+ *	pagebuf_ioerror
+ *
+ *	pagebuf_ioerror sets the error code for a buffer.
+ */
+void
+pagebuf_ioerror(			/* mark/clear buffer error flag */
+	page_buf_t		*pb,	/* buffer to mark		*/
+	unsigned int		error)	/* error to store (0 if none)	*/
+{
+	pb->pb_error = error;
+	PB_TRACE(pb, PB_TRACE_REC(ioerror), error);
+}
+
+/*
+ *	pagebuf_iostart
+ *
+ *	pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
+ *	If necessary, it will arrange for any disk space allocation required,
+ *	and it will break up the request if the block mappings require it.
+ *	An pb_iodone routine in the buffer supplied will only be called
+ *	when all of the subsidiary I/O requests, if any, have been completed.
+ *	pagebuf_iostart calls the pagebuf_ioinitiate routine or
+ *	pagebuf_iorequest, if the former routine is not defined, to start
+ *	the I/O on a given low-level request.
+ */
+int
+pagebuf_iostart(			/* start I/O on a buffer	  */
+	page_buf_t		*pb,	/* buffer to start		  */
+	page_buf_flags_t	flags)	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
+					/* PBF_WRITE, PBF_ALLOCATE,	  */
+					/* PBF_DELWRI, 			  */
+					/* PBF_SYNC, PBF_DONT_BLOCK	  */
+					/* PBF_RELEASE			  */
+{
+	int			status = 0;
+
+	PB_TRACE(pb, PB_TRACE_REC(iostart), flags);
+
+	if (flags & PBF_DELWRI) {
+		pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
+		pb->pb_flags |= flags &
+				(PBF_DELWRI | PBF_ASYNC | PBF_SYNC);
+		pagebuf_delwri_queue(pb, 1);
+		return status;
+	}
+
+	pb->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI|PBF_READ_AHEAD);
+	pb->pb_flags |= flags & (PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_READ_AHEAD);
+
+	if (pb->pb_bn == PAGE_BUF_DADDR_NULL) {
+		BUG();
+	}
+
+	/* For writes call internal function which checks for
+	 * filesystem specific callout function and execute it.
+	 */
+	if (flags & PBF_WRITE) {
+		status = __pagebuf_iorequest(pb);
+	} else {
+		status = pagebuf_iorequest(pb);
+	}
+
+	/* Wait for I/O if we are not an async request */
+	if ((status == 0) && (flags & PBF_ASYNC) == 0) {
+		status = pagebuf_iowait(pb);
+	}
+
+	return status;
+}
+
+/*
+ * Helper routine for pagebuf_iorequest
+ */
+STATIC void
+bio_end_io_pagebuf(
+	struct bio		*bio)
+{
+	page_buf_t		*pb = (page_buf_t *)bio->bi_private;
+	unsigned int		i, blocksize = pb->pb_target->pbr_blocksize;
+	struct bio_vec		*bvec = bio->bi_io_vec;
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		pb->pb_error = EIO;
+
+	for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
+		struct page	*page = bvec->bv_page;
+
+		if (pb->pb_error) {
+			SetPageError(page);
+		} else if (blocksize == PAGE_CACHE_SIZE) {
+			SetPageUptodate(page);
+		} else if (!PagePrivate(page)) {
+			unsigned int	j, range;
+
+			assert(blocksize < PAGE_CACHE_SIZE);
+			assert(!(pb->pb_flags & _PBF_PRIVATE_BH));
+
+			range = (bvec->bv_offset + bvec->bv_len)>>SECTOR_SHIFT;
+			for (j = bvec->bv_offset>>SECTOR_SHIFT; j < range; j++)
+				set_bit(j, &page->private);
+			if (page->private == (unsigned long)(PAGE_CACHE_SIZE-1))
+				SetPageUptodate(page);
+		}
+
+		if (pb->pb_locked) {
+			unlock_page(page);
+		} else {
+			BUG_ON(PageLocked(page));
+		}
+	}
+
+	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining)) {
+		pb->pb_locked = 0;
+		pagebuf_iodone(pb);
+	}
+
+	bio_put(bio);
+}
+
+/*
+ *	pagebuf_iorequest
+ *
+ *	pagebuf_iorequest is the core I/O request routine.
+ *	It assumes that the buffer is well-formed and
+ *	mapped and ready for physical I/O, unlike
+ *	pagebuf_iostart() and pagebuf_iophysio().  Those
+ *	routines call the pagebuf_ioinitiate routine to start I/O,
+ *	if it is present, or else call pagebuf_iorequest()
+ *	directly if the pagebuf_ioinitiate routine is not present.
+ *
+ *	This function will be responsible for ensuring access to the
+ *	pages is restricted whilst I/O is in progress - for locking
+ *	pagebufs the pagebuf lock is the mediator, for non-locking
+ *	pagebufs the pages will be locked. In the locking case we
+ *	need to use the pagebuf lock as multiple meta-data buffers
+ *	will reference the same page.
+ */
+int
+pagebuf_iorequest(			/* start real I/O		*/
+	page_buf_t		*pb)	/* buffer to convey to device	*/
+{
+	int			status = 0;
+	int			i, map_i, total_nr_pages, nr_pages;
+	struct bio		*bio;
+	struct bio_vec		*bvec;
+	int			offset = pb->pb_offset;
+	int			size = pb->pb_count_desired;
+	sector_t		sector = pb->pb_bn;
+	size_t			blocksize = pb->pb_target->pbr_blocksize;
+	int			locking;
+
+	locking = (pb->pb_flags & _PBF_LOCKABLE) == 0 && (pb->pb_locked == 0);
+
+	PB_TRACE(pb, PB_TRACE_REC(ioreq), 0);
+
+	if (pb->pb_flags & PBF_DELWRI) {
+		pagebuf_delwri_queue(pb, 1);
+		return status;
+	}
+
+	/* Set the count to 1 initially, this will stop an I/O
+	 * completion callout which happens before we have started
+	 * all the I/O from calling iodone too early
+	 */
+	atomic_set(&PBP(pb)->pb_io_remaining, 1);
+
+	/* Special code path for reading a sub page size pagebuf in --
+	 * we populate up the whole page, and hence the other metadata
+	 * in the same page.  This optimization is only valid when the
+	 * filesystem block size and the page size are equal.
+	 */
+	if (unlikely((pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
+	    (pb->pb_flags & PBF_READ) && pb->pb_locked &&
+	    (blocksize == PAGE_CACHE_SIZE))) {
+		bio = bio_alloc(GFP_NOIO, 1);
+
+		bio->bi_bdev = pb->pb_target->pbr_bdev;
+		bio->bi_sector = sector - (offset >> SECTOR_SHIFT);
+		bio->bi_end_io = bio_end_io_pagebuf;
+		bio->bi_private = pb;
+		bio->bi_vcnt++;
+		bio->bi_size = PAGE_CACHE_SIZE;
+
+		bvec = bio->bi_io_vec;
+		bvec->bv_page = pb->pb_pages[0];
+		bvec->bv_len = PAGE_CACHE_SIZE;
+		bvec->bv_offset = 0;
+
+		atomic_inc(&PBP(pb)->pb_io_remaining);
+		submit_bio(READ, bio);
+
+		goto io_submitted;
+	}
+
+	if (pb->pb_flags & PBF_WRITE) {
+		_pagebuf_wait_unpin(pb);
+	}
+
+	/* Lock down the pages which we need to for the request */
+	if (locking) {
+		for (i = 0; size; i++) {
+			int		nbytes = PAGE_CACHE_SIZE - offset;
+			struct page	*page = pb->pb_pages[i];
+
+			if (nbytes > size)
+				nbytes = size;
+
+			lock_page(page);
+
+			size -= nbytes;
+			offset = 0;
+		}
+		offset = pb->pb_offset;
+		size = pb->pb_count_desired;
+		pb->pb_locked = 1;
+	}
+
+	total_nr_pages = pb->pb_page_count;
+	map_i = 0;
+
+next_chunk:
+	atomic_inc(&PBP(pb)->pb_io_remaining);
+	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - SECTOR_SHIFT);
+	if (nr_pages > total_nr_pages)
+		nr_pages = total_nr_pages;
+
+	bio = bio_alloc(GFP_NOIO, nr_pages);
+
+	BUG_ON(bio == NULL);
+	bio_init(bio);
+	bio->bi_bdev = pb->pb_target->pbr_bdev;
+	bio->bi_sector = sector;
+	bio->bi_end_io = bio_end_io_pagebuf;
+	bio->bi_private = pb;
+
+	bvec = bio->bi_io_vec;
+
+	for (; size && nr_pages; nr_pages--, bvec++, map_i++) {
+		int	nbytes = PAGE_CACHE_SIZE - offset;
+
+		if (nbytes > size)
+			nbytes = size;
+
+		bio->bi_vcnt++;
+		bio->bi_size += nbytes;
+
+		bvec->bv_page = pb->pb_pages[map_i];
+		bvec->bv_len = nbytes;
+		bvec->bv_offset = offset;
+
+		offset = 0;
+
+		sector += nbytes >> SECTOR_SHIFT;
+		size -= nbytes;
+		total_nr_pages--;
+	}
+
+	if (pb->pb_flags & PBF_READ) {
+		submit_bio(READ, bio);
+	} else {
+		submit_bio(WRITE, bio);
+	}
+
+	if (size)
+		goto next_chunk;
+
+io_submitted:
+
+	if (atomic_dec_and_test(&PBP(pb)->pb_io_remaining) == 1) {
+		pagebuf_iodone(pb);
+	} else if ((pb->pb_flags & (PBF_SYNC|PBF_ASYNC)) == PBF_SYNC)  {
+		blk_run_queues();
+	}
+
+	return status < 0 ? status : 0;
+}
+
+/*
+ *	pagebuf_iowait
+ *
+ *	pagebuf_iowait waits for I/O to complete on the buffer supplied.
+ *	It returns immediately if no I/O is pending.  In any case, it returns
+ *	the error code, if any, or 0 if there is no error.
+ */
+int
+pagebuf_iowait(
+	page_buf_t		*pb)
+{
+	PB_TRACE(pb, PB_TRACE_REC(iowait), 0);
+	blk_run_queues();
+	down(&pb->pb_iodonesema);
+	PB_TRACE(pb, PB_TRACE_REC(iowaited), (int)pb->pb_error);
+	return pb->pb_error;
+}
+
+STATIC void *
+pagebuf_mapout_locked(
+	page_buf_t		*pb)
+{
+	void			*old_addr = NULL;
+
+	if (pb->pb_flags & PBF_MAPPED) {
+		if (pb->pb_flags & _PBF_ADDR_ALLOCATED)
+			old_addr = pb->pb_addr - pb->pb_offset;
+		pb->pb_addr = NULL;
+		pb->pb_flags &= ~(PBF_MAPPED | _PBF_ADDR_ALLOCATED);
+	}
+
+	return old_addr;	/* Caller must free the address space,
+				 * we are under a spin lock, probably
+				 * not safe to do vfree here
+				 */
+}
+
+caddr_t
+pagebuf_offset(
+	page_buf_t		*pb,
+	off_t			offset)
+{
+	struct page		*page;
+
+	offset += pb->pb_offset;
+
+	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
+	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
+}
+
+/*
+ *	pagebuf_segment
+ *
+ *	pagebuf_segment is used to retrieve the various contiguous
+ *	segments of a buffer.  The variable addressed by the
+ *	loff_t * should be initialized to 0, and successive
+ *	calls will update to point to the segment following the one
+ *	returned.
+ */
+STATIC void
+pagebuf_segment(
+	page_buf_t		*pb,	/* buffer to examine		*/
+	loff_t			*boff_p,/* offset in buffer of next	*/
+					/* next segment (updated)	*/
+	struct page		**spage_p, /* page (updated)		*/
+					/* (NULL if not in page array)	*/
+	size_t			*soff_p,/* offset in page (updated)	*/
+	size_t			*ssize_p) /* segment length (updated)	*/
+{
+	loff_t			kpboff;	/* offset in pagebuf		*/
+	int			kpi;	/* page index in pagebuf	*/
+	size_t			slen;	/* segment length		*/
+
+	kpboff = *boff_p;
+
+	kpi = page_buf_btoct(kpboff + pb->pb_offset);
+
+	*spage_p = pb->pb_pages[kpi];
+
+	*soff_p = page_buf_poff(kpboff + pb->pb_offset);
+	slen = PAGE_CACHE_SIZE - *soff_p;
+	if (slen > (pb->pb_count_desired - kpboff))
+		slen = (pb->pb_count_desired - kpboff);
+	*ssize_p = slen;
+
+	*boff_p = *boff_p + slen;
+}
+
+/*
+ *	pagebuf_iomove
+ *
+ *	Move data into or out of a buffer.
+ */
+void
+pagebuf_iomove(
+	page_buf_t		*pb,	/* buffer to process		*/
+	off_t			boff,	/* starting buffer offset	*/
+	size_t			bsize,	/* length to copy		*/
+	caddr_t			data,	/* data address			*/
+	page_buf_rw_t		mode)	/* read/write flag		*/
+{
+	loff_t			cboff;
+	size_t			cpoff;
+	size_t			csize;
+	struct page		*page;
+
+	cboff = boff;
+	boff += bsize; /* last */
+
+	while (cboff < boff) {
+		pagebuf_segment(pb, &cboff, &page, &cpoff, &csize);
+		assert(((csize + cpoff) <= PAGE_CACHE_SIZE));
+
+		switch (mode) {
+		case PBRW_ZERO:
+			memset(page_address(page) + cpoff, 0, csize);
+			break;
+		case PBRW_READ:
+			memcpy(data, page_address(page) + cpoff, csize);
+			break;
+		case PBRW_WRITE:
+			memcpy(page_address(page) + cpoff, data, csize);
+		}
+
+		data += csize;
+	}
+}
+
+/*
+ * Pagebuf delayed write buffer handling
+ */
+
+void
+pagebuf_delwri_queue(
+	page_buf_t		*pb,
+	int			unlock)
+{
+	PB_TRACE(pb, PB_TRACE_REC(delwri_q), unlock);
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	/* If already in the queue, dequeue and place at tail */
+	if (!list_empty(&pb->pb_list)) {
+		if (unlock) {
+			atomic_dec(&pb->pb_hold);
+		}
+		list_del(&pb->pb_list);
+	} else {
+		pb_daemon->pb_delwri_cnt++;
+	}
+	list_add_tail(&pb->pb_list, &pb_daemon->pb_delwrite_l);
+	PBP(pb)->pb_flushtime = jiffies + pb_params.p_un.age_buffer;
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+	if (unlock && (pb->pb_flags & _PBF_LOCKABLE)) {
+		pagebuf_unlock(pb);
+	}
+}
+
+void
+pagebuf_delwri_dequeue(
+	page_buf_t		*pb)
+{
+	PB_TRACE(pb, PB_TRACE_REC(delwri_uq), 0);
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	list_del_init(&pb->pb_list);
+	pb->pb_flags &= ~PBF_DELWRI;
+	pb_daemon->pb_delwri_cnt--;
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+}
+
+
+/*
+ * The pagebuf iodone daemon
+ */
+
+STATIC int pb_daemons[NR_CPUS];
+
+STATIC int
+pagebuf_iodone_daemon(
+	void			*__bind_cpu)
+{
+	int			cpu = (long) __bind_cpu;
+	DECLARE_WAITQUEUE	(wait, current);
+
+	/*  Set up the thread  */
+	daemonize();
+
+	/* Avoid signals */
+	spin_lock_irq(&current->sigmask_lock);
+	sigfillset(&current->blocked);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sigmask_lock);
+
+	/* Migrate to the right CPU */
+	set_cpus_allowed(current, 1UL << cpu);
+	if (smp_processor_id() != cpu)
+		BUG();
+
+	sprintf(current->comm, "pagebuf_io_CPU%d", cpu);
+	INIT_LIST_HEAD(&pagebuf_iodone_tq[cpu]);
+	init_waitqueue_head(&pagebuf_iodone_wait[cpu]);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	mb();
+
+	pb_daemons[cpu] = 1;
+
+	for (;;) {
+		add_wait_queue(&pagebuf_iodone_wait[cpu],
+				&wait);
+
+		if (TQ_ACTIVE(pagebuf_iodone_tq[cpu]))
+			__set_task_state(current, TASK_RUNNING);
+		schedule();
+		remove_wait_queue(&pagebuf_iodone_wait[cpu],
+				&wait);
+		run_task_queue(&pagebuf_iodone_tq[cpu]);
+		if (pb_daemons[cpu] == 0)
+			break;
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+
+	pb_daemons[cpu] = -1;
+	wake_up_interruptible(&pagebuf_iodone_wait[cpu]);
+	return 0;
+}
+
+/* Defines for pagebuf daemon */
+DECLARE_WAIT_QUEUE_HEAD(pbd_waitq);
+STATIC int force_flush;
+
+STATIC void
+pagebuf_daemon_wakeup(
+	int			flag)
+{
+	force_flush = flag;
+	if (waitqueue_active(&pbd_waitq)) {
+		wake_up_interruptible(&pbd_waitq);
+	}
+}
+
+typedef void (*timeout_fn)(unsigned long);
+
+STATIC int
+pagebuf_daemon(
+	void			*data)
+{
+	int			count;
+	page_buf_t		*pb;
+	struct list_head	*curr, *next, tmp;
+	struct timer_list	pb_daemon_timer =
+		{ {NULL, NULL}, 0, 0, (timeout_fn)pagebuf_daemon_wakeup };
+
+	/*  Set up the thread  */
+	daemonize();
+
+	/* Avoid signals */
+	spin_lock_irq(&current->sigmask_lock);
+	sigfillset(&current->blocked);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sigmask_lock);
+
+	strcpy(current->comm, "pagebufd");
+	current->flags |= PF_MEMALLOC;
+
+	INIT_LIST_HEAD(&tmp);
+	do {
+		if (pb_daemon->active == 1) {
+			del_timer(&pb_daemon_timer);
+			pb_daemon_timer.expires = jiffies +
+					pb_params.p_un.flush_interval;
+			add_timer(&pb_daemon_timer);
+			interruptible_sleep_on(&pbd_waitq);
+		}
+
+		if (pb_daemon->active == 0) {
+			del_timer(&pb_daemon_timer);
+		}
+
+		spin_lock(&pb_daemon->pb_delwrite_lock);
+
+		count = 0;
+		list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) {
+			pb = list_entry(curr, page_buf_t, pb_list);
+
+			PB_TRACE(pb, PB_TRACE_REC(walkq1), pagebuf_ispin(pb));
+
+			if ((pb->pb_flags & PBF_DELWRI) && !pagebuf_ispin(pb) &&
+			    (((pb->pb_flags & _PBF_LOCKABLE) == 0) ||
+			     !pagebuf_cond_lock(pb))) {
+
+				if (!force_flush && time_before(jiffies,
+						PBP(pb)->pb_flushtime)) {
+					pagebuf_unlock(pb);
+					break;
+				}
+
+				list_del(&pb->pb_list);
+				list_add(&pb->pb_list, &tmp);
+
+				count++;
+			}
+		}
+
+		spin_unlock(&pb_daemon->pb_delwrite_lock);
+		while (!list_empty(&tmp)) {
+			pb = list_entry(tmp.next,
+							page_buf_t, pb_list);
+			list_del_init(&pb->pb_list);
+			pb->pb_flags &= ~PBF_DELWRI;
+			pb->pb_flags |= PBF_WRITE;
+
+			__pagebuf_iorequest(pb);
+		}
+
+		if (count)
+			blk_run_queues();
+		if (as_list_len > 0)
+			purge_addresses();
+
+		force_flush = 0;
+	} while (pb_daemon->active == 1);
+
+	pb_daemon->active = -1;
+	wake_up_interruptible(&pbd_waitq);
+
+	return 0;
+}
+
+void
+pagebuf_delwri_flush(
+	pb_target_t		*target,
+	u_long			flags,
+	int			*pinptr)
+{
+	page_buf_t		*pb;
+	struct list_head	*curr, *next, tmp;
+	int			pincount = 0;
+
+	spin_lock(&pb_daemon->pb_delwrite_lock);
+	INIT_LIST_HEAD(&tmp);
+
+	list_for_each_safe(curr, next, &pb_daemon->pb_delwrite_l) {
+		pb = list_entry(curr, page_buf_t, pb_list);
+
+		/*
+		 * Skip other targets, markers and in progress buffers
+		 */
+
+		if ((pb->pb_flags == 0) || (pb->pb_target != target) ||
+		    !(pb->pb_flags & PBF_DELWRI)) {
+			continue;
+		}
+
+		PB_TRACE(pb, PB_TRACE_REC(walkq2), pagebuf_ispin(pb));
+		if (pagebuf_ispin(pb)) {
+			pincount++;
+			continue;
+		}
+
+		if (flags & PBDF_TRYLOCK) {
+			if (!pagebuf_cond_lock(pb)) {
+				pincount++;
+				continue;
+			}
+		}
+
+		list_del_init(&pb->pb_list);
+		if (flags & PBDF_WAIT) {
+			list_add(&pb->pb_list, &tmp);
+			pb->pb_flags &= ~PBF_ASYNC;
+		}
+
+		spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+		if ((flags & PBDF_TRYLOCK) == 0) {
+			pagebuf_lock(pb);
+		}
+
+		pb->pb_flags &= ~PBF_DELWRI;
+		pb->pb_flags |= PBF_WRITE;
+
+		__pagebuf_iorequest(pb);
+
+		spin_lock(&pb_daemon->pb_delwrite_lock);
+	}
+
+	spin_unlock(&pb_daemon->pb_delwrite_lock);
+
+	blk_run_queues();
+
+	if (pinptr)
+		*pinptr = pincount;
+
+	if ((flags & PBDF_WAIT) == 0)
+		return;
+
+	while (!list_empty(&tmp)) {
+		pb = list_entry(tmp.next, page_buf_t, pb_list);
+
+		list_del_init(&pb->pb_list);
+		pagebuf_iowait(pb);
+		if (!pb->pb_relse)
+			pagebuf_unlock(pb);
+		pagebuf_rele(pb);
+	}
+}
+
+STATIC int
+pagebuf_daemon_start(void)
+{
+	if (!pb_daemon) {
+		int		cpu;
+
+		pb_daemon = (pagebuf_daemon_t *)
+				kmalloc(sizeof(pagebuf_daemon_t), GFP_KERNEL);
+		if (!pb_daemon) {
+			return -1; /* error */
+		}
+
+		pb_daemon->active = 1;
+		pb_daemon->io_active = 1;
+		pb_daemon->pb_delwri_cnt = 0;
+		pb_daemon->pb_delwrite_lock = SPIN_LOCK_UNLOCKED;
+
+		INIT_LIST_HEAD(&pb_daemon->pb_delwrite_l);
+
+		kernel_thread(pagebuf_daemon, (void *)pb_daemon,
+				CLONE_FS|CLONE_FILES|CLONE_VM);
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_online(cpu))
+				continue;
+			if (kernel_thread(pagebuf_iodone_daemon,
+					(void *)(long) cpu,
+					CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
+				printk("pagebuf_daemon_start failed\n");
+			} else {
+				while (!pb_daemons[cpu]) {
+					yield();
+				}
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * pagebuf_daemon_stop
+ * 
+ * Note: do not mark as __exit, it is called from pagebuf_terminate.
+ */
+STATIC void
+pagebuf_daemon_stop(void)
+{
+	if (pb_daemon) {
+		int		cpu;
+
+		pb_daemon->active = 0;
+		pb_daemon->io_active = 0;
+
+		wake_up_interruptible(&pbd_waitq);
+		while (pb_daemon->active == 0) {
+			interruptible_sleep_on(&pbd_waitq);
+		}
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (!cpu_online(cpu))
+				continue;
+			pb_daemons[cpu] = 0;
+			wake_up(&pagebuf_iodone_wait[cpu]);
+			while (pb_daemons[cpu] != -1) {
+				interruptible_sleep_on(
+					&pagebuf_iodone_wait[cpu]);
+			}
+		}
+
+		kfree(pb_daemon);
+		pb_daemon = NULL;
+	}
+}
+
+
+/*
+ * Pagebuf sysctl interface
+ */
+
+STATIC int
+pb_stats_clear_handler(
+	ctl_table		*ctl,
+	int			write,
+	struct file		*filp,
+	void			*buffer,
+	size_t			*lenp)
+{
+	int			ret;
+	int			*valp = ctl->data;
+
+	ret = proc_doulongvec_minmax(ctl, write, filp, buffer, lenp);
+
+	if (!ret && write && *valp) {
+		printk("XFS Clearing pbstats\n");
+		memset(&pbstats, 0, sizeof(pbstats));
+		pb_params.p_un.stats_clear = 0;
+	}
+
+	return ret;
+}
+
+STATIC struct ctl_table_header *pagebuf_table_header;
+
+STATIC ctl_table pagebuf_table[] = {
+	{PB_FLUSH_INT, "flush_int", &pb_params.data[0],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[0], &pagebuf_max[0]},
+
+	{PB_FLUSH_AGE, "flush_age", &pb_params.data[1],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_ms_jiffies_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[1], &pagebuf_max[1]},
+
+	{PB_STATS_CLEAR, "stats_clear", &pb_params.data[3],
+	sizeof(ulong), 0644, NULL, &pb_stats_clear_handler,
+	&sysctl_intvec, NULL, &pagebuf_min[3], &pagebuf_max[3]},
+
+#ifdef PAGEBUF_TRACE
+	{PB_DEBUG, "debug", &pb_params.data[4],
+	sizeof(ulong), 0644, NULL, &proc_doulongvec_minmax,
+	&sysctl_intvec, NULL, &pagebuf_min[4], &pagebuf_max[4]},
+#endif
+	{0}
+};
+
+STATIC ctl_table pagebuf_dir_table[] = {
+	{VM_PAGEBUF, "pagebuf", NULL, 0, 0555, pagebuf_table},
+	{0}
+};
+
+STATIC ctl_table pagebuf_root_table[] = {
+	{CTL_VM, "vm",	NULL, 0, 0555, pagebuf_dir_table},
+	{0}
+};
+
+#ifdef CONFIG_PROC_FS
+STATIC int
+pagebuf_readstats(
+	char			*buffer,
+	char			**start,
+	off_t			offset,
+	int			count,
+	int			*eof,
+	void			*data)
+{
+	int			i, len;
+
+	len = 0;
+	len += sprintf(buffer + len, "pagebuf");
+	for (i = 0; i < sizeof(pbstats) / sizeof(u_int32_t); i++) {
+		len += sprintf(buffer + len, " %u",
+			*(((u_int32_t*)&pbstats) + i));
+	}
+	buffer[len++] = '\n';
+
+	if (offset >= len) {
+		*start = buffer;
+		*eof = 1;
+		return 0;
+	}
+	*start = buffer + offset;
+	if ((len -= offset) > count)
+		return count;
+	*eof = 1;
+
+	return len;
+}
+#endif	/* CONFIG_PROC_FS */
+
+STATIC void
+pagebuf_shaker(void)
+{
+	pagebuf_daemon_wakeup(1);
+}
+
+
+/*
+ *	Initialization and Termination
+ */
+
+int __init
+pagebuf_init(void)
+{
+	int			i;
+
+	pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);
+
+#ifdef CONFIG_PROC_FS
+	if (proc_mkdir("fs/pagebuf", 0))
+		create_proc_read_entry(
+			"fs/pagebuf/stat", 0, 0, pagebuf_readstats, NULL);
+#endif
+
+	pagebuf_cache = kmem_cache_create("page_buf_t",
+			sizeof(page_buf_private_t), 0,
+			SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (pagebuf_cache == NULL) {
+		printk("pagebuf: couldn't init pagebuf cache\n");
+		pagebuf_terminate();
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < NHASH; i++) {
+		spin_lock_init(&pbhash[i].pb_hash_lock);
+		INIT_LIST_HEAD(&pbhash[i].pb_hash);
+	}
+
+#ifdef PAGEBUF_TRACE
+# if 1
+	pb_trace.buf = (pagebuf_trace_t *)kmalloc(
+			PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t), GFP_KERNEL);
+# else
+	/* Alternatively, for really really long trace bufs */
+	pb_trace.buf = (pagebuf_trace_t *)vmalloc(
+			PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t));
+# endif
+	memset(pb_trace.buf, 0, PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t));
+	pb_trace.start = 0;
+	pb_trace.end = PB_TRACE_BUFSIZE - 1;
+#endif
+
+	pagebuf_daemon_start();
+	kmem_shake_register(pagebuf_shaker);
+	return 0;
+}
+
+
+/*
+ *	pagebuf_terminate. 
+ * 
+ *	Note: do not mark as __exit, this is also called from the __init code.
+ */
+void
+pagebuf_terminate(void)
+{
+	pagebuf_daemon_stop();
+
+	kmem_cache_destroy(pagebuf_cache);
+	kmem_shake_deregister(pagebuf_shaker);
+
+	unregister_sysctl_table(pagebuf_table_header);
+#ifdef	CONFIG_PROC_FS
+	remove_proc_entry("fs/pagebuf/stat", NULL);
+	remove_proc_entry("fs/pagebuf", NULL);
+#endif
+}
+
+
+/*
+ *	Module management (for kernel debugger module)
+ */
+EXPORT_SYMBOL(pagebuf_offset);
diff --git a/fs/xfs/pagebuf/page_buf.h b/fs/xfs/pagebuf/page_buf.h
new file mode 100644
index 000000000000..88810e355316
--- /dev/null
+++ b/fs/xfs/pagebuf/page_buf.h
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Written by Steve Lord, Jim Mostek, Russell Cattelan at SGI
+ */
+
+#ifndef __PAGE_BUF_H__
+#define __PAGE_BUF_H__
+
+#include <linux/version.h>
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <asm/system.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/uio.h>
+#include <linux/tqueue.h>
+
+enum xfs_buffer_state { BH_Delay = BH_PrivateStart };
+BUFFER_FNS(Delay, delay);
+
+/*
+ * Turn this on to get pagebuf lock ownership
+#define PAGEBUF_LOCK_TRACKING
+*/
+
+/*
+ *	Base types
+ */
+
+/* daddr must be signed since -1 is used for bmaps that are not yet allocated */
+typedef loff_t page_buf_daddr_t;
+
+#define PAGE_BUF_DADDR_NULL ((page_buf_daddr_t) (-1LL))
+
+typedef size_t page_buf_dsize_t;		/* size of buffer in blocks */
+
+#define page_buf_ctob(pp)	((pp) * PAGE_CACHE_SIZE)
+#define page_buf_btoc(dd)	(((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)
+#define page_buf_btoct(dd)	((dd) >> PAGE_CACHE_SHIFT)
+#define page_buf_poff(aa)	((aa) & ~PAGE_CACHE_MASK)
+
+typedef enum page_buf_rw_e {
+	PBRW_READ = 1,			/* transfer into target memory */
+	PBRW_WRITE = 2,			/* transfer from target memory */
+	PBRW_ZERO = 3			/* Zero target memory */
+} page_buf_rw_t;
+
+typedef enum {				/* pbm_flags values */
+	PBMF_EOF =		0x01,	/* mapping contains EOF		*/
+	PBMF_HOLE =		0x02,	/* mapping covers a hole	*/
+	PBMF_DELAY =		0x04,	/* mapping covers delalloc region  */
+	PBMF_UNWRITTEN =	0x20	/* mapping covers allocated	*/
+					/* but uninitialized XFS data	*/
+} bmap_flags_t;
+
+typedef enum page_buf_flags_e {		/* pb_flags values */
+	PBF_READ = (1 << 0),	/* buffer intended for reading from device */
+	PBF_WRITE = (1 << 1),	/* buffer intended for writing to device   */
+	PBF_MAPPED = (1 << 2),	/* buffer mapped (pb_addr valid)	   */
+	PBF_PARTIAL = (1 << 3), /* buffer partially read		   */
+	PBF_ASYNC = (1 << 4),	/* initiator will not wait for completion  */
+	PBF_NONE = (1 << 5),	/* buffer not read at all		   */
+	PBF_DELWRI = (1 << 6),	/* buffer has dirty pages		   */
+	PBF_FREED = (1 << 7),	/* buffer has been freed and is invalid	   */
+	PBF_SYNC = (1 << 8),	/* force updates to disk		   */
+	PBF_MAPPABLE = (1 << 9),/* use directly-addressable pages	   */
+	PBF_STALE = (1 << 10),	/* buffer has been staled, do not find it  */
+	PBF_FS_MANAGED = (1 << 11), /* filesystem controls freeing memory  */
+	PBF_RELEASE = (1 << 12),/* buffer to be released after I/O is done */
+
+	/* flags used only as arguments to access routines */
+	PBF_LOCK = (1 << 13),	/* lock requested			   */
+	PBF_TRYLOCK = (1 << 14), /* lock requested, but do not wait	   */
+	PBF_ALLOCATE = (1 << 15), /* allocate all pages		  (UNUSED) */
+	PBF_FILE_ALLOCATE = (1 << 16), /* allocate all file space	   */
+	PBF_DONT_BLOCK = (1 << 17), /* do not block in current thread	   */
+	PBF_DIRECT = (1 << 18),	  /* direct I/O desired			   */
+	PBF_ENTER_PAGES = (1 << 21), /* create invalid pages for all	   */
+				/* pages in the range of the buffer	   */
+				/* not already associated with buffer	   */
+
+	/* flags used only internally */
+	_PBF_LOCKABLE = (1 << 19), /* page_buf_t may be locked		   */
+	_PBF_PRIVATE_BH = (1 << 20), /* do not use public buffer heads	   */
+	_PBF_ALL_PAGES_MAPPED = (1 << 22),
+				/* all pages in rage are mapped		   */
+	_PBF_SOME_INVALID_PAGES = (1 << 23),
+				/* some mapped pages are not valid	   */
+	_PBF_ADDR_ALLOCATED = (1 << 24),
+				/* pb_addr space was allocated		   */
+	_PBF_MEM_ALLOCATED = (1 << 25),
+				/* pb_mem and underlying pages allocated   */
+
+	PBF_FORCEIO = (1 << 27),
+	PBF_FLUSH = (1 << 28),	/* flush disk write cache */
+	PBF_READ_AHEAD = (1 << 29),
+	PBF_FS_RESERVED_3 = (1 << 31)	/* reserved (XFS use: XFS_B_STALE) */
+
+} page_buf_flags_t;
+
+#define PBF_UPDATE (PBF_READ | PBF_WRITE)
+#define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
+#define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
+
+#define PBR_SECTOR_ONLY	1	/* only use sector size buffer heads */
+#define PBR_ALIGNED_ONLY 2	/* only use aligned I/O */
+
+typedef struct pb_target {
+	int			pbr_flags;
+	dev_t			pbr_dev;
+	struct block_device	*pbr_bdev;
+	struct address_space	*pbr_mapping;
+	unsigned int		pbr_blocksize;
+	unsigned int		pbr_blocksize_bits;
+} pb_target_t;
+
+/*
+ *	page_buf_bmap_t:  File system I/O map
+ *
+ * The pbm_bn, pbm_offset and pbm_length fields are expressed in disk blocks.
+ * The pbm_length field specifies the size of the underlying backing store
+ * for the particular mapping.
+ *
+ * The pbm_bsize, pbm_size and pbm_delta fields are in bytes and indicate
+ * the size of the mapping, the number of bytes that are valid to access
+ * (read or write), and the offset into the mapping, given the offset
+ * supplied to the file I/O map routine.  pbm_delta is the offset of the
+ * desired data from the beginning of the mapping.
+ *
+ * When a request is made to read beyond the logical end of the object,
+ * pbm_size may be set to 0, but pbm_offset and pbm_length should be set to
+ * the actual amount of underlying storage that has been allocated, if any.
+ */
+
+typedef struct page_buf_bmap_s {
+	page_buf_daddr_t pbm_bn;	/* block number in file system	    */
+	pb_target_t	*pbm_target;	/* device to do I/O to		    */
+	loff_t		pbm_offset;	/* byte offset of mapping in file   */
+	size_t		pbm_delta;	/* offset of request into bmap	    */
+	size_t		pbm_bsize;	/* size of this mapping in bytes    */
+	bmap_flags_t	pbm_flags;	/* options flags for mapping	    */
+} page_buf_bmap_t;
+
+typedef page_buf_bmap_t pb_bmap_t;
+
+
+/*
+ *	page_buf_t:  Buffer structure for page cache-based buffers
+ *
+ * This buffer structure is used by the page cache buffer management routines
+ * to refer to an assembly of pages forming a logical buffer.  The actual
+ * I/O is performed with buffer_head or bio structures, as required by drivers,
+ * for drivers which do not understand this structure.  The buffer structure is
+ * used on temporary basis only, and discarded when released.  
+ *
+ * The real data storage is recorded in the page cache.	 Metadata is
+ * hashed to the inode for the block device on which the file system resides.
+ * File data is hashed to the inode for the file.  Pages which are only
+ * partially filled with data have bits set in their block_map entry
+ * to indicate which disk blocks in the page are not valid.
+ */
+
+struct page_buf_s;
+typedef void (*page_buf_iodone_t)(struct page_buf_s *);
+			/* call-back function on I/O completion */
+typedef void (*page_buf_relse_t)(struct page_buf_s *);
+			/* call-back function on I/O completion */
+typedef int (*page_buf_bdstrat_t)(struct page_buf_s *);
+
+#define PB_PAGES	4
+
+typedef struct page_buf_s {
+	struct list_head	pb_list;
+	page_buf_flags_t	pb_flags;	/* status flags */
+	struct list_head	pb_hash_list;
+	struct pb_target	*pb_target;	/* logical object */
+	atomic_t		pb_hold;	/* reference count */
+	page_buf_daddr_t	pb_bn;		/* block number for I/O */
+	loff_t			pb_file_offset; /* offset in file */
+	size_t			pb_buffer_length; /* size of buffer in bytes */
+	size_t			pb_count_desired; /* desired transfer size */
+	void			*pb_addr;	/* virtual address of buffer */
+	struct tq_struct	pb_iodone_sched;
+	page_buf_iodone_t	pb_iodone;	/* I/O completion function */
+	page_buf_relse_t	pb_relse;	/* releasing function */
+	page_buf_bdstrat_t	pb_strat;	/* pre-write function */
+	struct semaphore	pb_iodonesema;	/* Semaphore for I/O waiters */
+	void			*pb_fspriv;
+	void			*pb_fspriv2;
+	void			*pb_fspriv3;
+	unsigned short		pb_error;	/* error code on I/O */
+	unsigned short		pb_page_count;	/* size of page array */
+	unsigned short		pb_offset;	/* page offset in first page */
+	unsigned char		pb_locked;	/* page array is locked */
+	unsigned char		pb_hash_index;	/* hash table index	*/
+	struct page		**pb_pages;	/* array of page pointers */
+	struct page		*pb_page_array[PB_PAGES]; /* inline pages */
+} page_buf_t;
+
+
+/*
+ * page_buf module entry points
+ */
+
+/* Finding and Reading Buffers */
+
+extern page_buf_t *pagebuf_find(	/* find buffer for block if	*/
+					/* the block is in memory	*/
+		struct pb_target *,	/* inode for block		*/
+		loff_t,			/* starting offset of range	*/
+		size_t,			/* length of range		*/
+		page_buf_flags_t);	/* PBF_LOCK			*/
+
+extern page_buf_t *pagebuf_get(		/* allocate a buffer		*/
+		struct pb_target *,	/* inode for buffer		*/
+		loff_t,			/* starting offset of range	*/
+		size_t,			/* length of range		*/
+		page_buf_flags_t);	/* PBF_LOCK, PBF_READ, PBF_ALLOCATE, */
+					/* PBF_ASYNC,			*/	
+
+extern page_buf_t *pagebuf_lookup(
+		struct pb_target *,
+		struct inode *,
+		loff_t,			/* starting offset of range	*/
+		size_t,			/* length of range		*/
+		int);			/* PBF_ENTER_PAGES		*/
+
+extern page_buf_t *pagebuf_get_empty(	/* allocate pagebuf struct with */
+					/*  no memory or disk address	*/
+		struct pb_target *);	/* mount point "fake" inode	*/
+
+extern page_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct	*/
+					/* without disk address		*/
+		size_t len,
+		struct pb_target *);	/* mount point "fake" inode	*/
+
+extern int	pagebuf_associate_memory(
+		page_buf_t *,
+		void *,
+		size_t);
+
+
+extern void pagebuf_hold(		/* increment reference count	*/
+		page_buf_t *);		/* buffer to hold		*/
+
+extern void pagebuf_readahead(		/* read ahead into cache	*/
+		struct pb_target  *,	/* target for buffer (or NULL)	*/
+		loff_t,			/* starting offset of range	*/
+		size_t,			/* length of range		*/
+		int);			/* additional read flags	*/
+
+/* Writing and Releasing Buffers */
+
+extern void pagebuf_free(		/* deallocate a buffer		*/
+		page_buf_t *);		/* buffer to deallocate		*/
+
+extern void pagebuf_rele(		/* release hold on a buffer	*/
+		page_buf_t *);		/* buffer to release		*/
+
+/* Locking and Unlocking Buffers */
+
+extern int pagebuf_cond_lock(		/* lock buffer, if not locked	*/
+					/* (returns -EBUSY if locked)	*/
+		page_buf_t *);		/* buffer to lock		*/
+
+extern int pagebuf_lock_value(		/* return count on lock		*/
+		page_buf_t *);		/* buffer to check		*/
+
+extern int pagebuf_lock(		/* lock buffer			*/
+		page_buf_t *);		/* buffer to lock		*/
+
+extern void pagebuf_lock_disable(	/* disable buffer locking	*/
+		struct pb_target *,	/* inode for buffers		*/
+		int);			/* do blkdev_put?		*/
+
+extern struct pb_target *pagebuf_lock_enable(
+		dev_t,
+		int);			/* do blkdev_get?		*/
+
+extern void pagebuf_target_blocksize(
+		pb_target_t *,
+		unsigned int);		/* block size			*/
+
+extern void pagebuf_target_clear(struct pb_target *);
+
+extern void pagebuf_unlock(		/* unlock buffer		*/
+		page_buf_t *);		/* buffer to unlock		*/
+
+/* Buffer Utility Routines */
+
+#define pagebuf_geterror(pb)	((pb)->pb_error)
+
+extern void pagebuf_queue_task(
+		struct tq_struct *);
+
+extern void pagebuf_iodone(		/* mark buffer I/O complete	*/
+		page_buf_t *);		/* buffer to mark		*/
+
+extern void pagebuf_ioerror(		/* mark buffer in error (or not) */
+		page_buf_t *,		/* buffer to mark		*/
+		unsigned int);		/* error to store (0 if none)	*/
+
+extern int pagebuf_iostart(		/* start I/O on a buffer	*/
+		page_buf_t *,		/* buffer to start		*/
+		page_buf_flags_t);	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
+					/* PBF_WRITE, PBF_ALLOCATE,	*/
+					/* PBF_DELWRI, 			*/
+					/* PBF_SYNC			*/
+
+extern int pagebuf_iorequest(		/* start real I/O		*/
+		page_buf_t *);		/* buffer to convey to device	*/
+
+	/*
+	 * pagebuf_iorequest is the core I/O request routine.
+	 * It assumes that the buffer is well-formed and
+	 * mapped and ready for physical I/O, unlike
+	 * pagebuf_iostart() and pagebuf_iophysio().  Those
+	 * routines call the inode pagebuf_ioinitiate routine to start I/O,
+	 * if it is present, or else call pagebuf_iorequest()
+	 * directly if the inode pagebuf_ioinitiate routine is not present.
+	 */
+
+extern int pagebuf_iowait(		/* wait for buffer I/O done	*/
+		page_buf_t *);		/* buffer to wait on		*/
+
+extern caddr_t	pagebuf_offset(page_buf_t *, off_t);
+
+extern void pagebuf_iomove(		/* move data in/out of pagebuf	*/
+		page_buf_t *,		/* buffer to manipulate		*/
+		off_t,			/* starting buffer offset	*/
+		size_t,			/* length in buffer		*/
+		caddr_t,		/* data pointer			*/
+		page_buf_rw_t);		/* direction			*/
+
+/* Pinning Buffer Storage in Memory */
+
+extern void pagebuf_pin(		/* pin buffer in memory		*/
+		page_buf_t *);		/* buffer to pin		*/
+
+extern void pagebuf_unpin(		/* unpin buffered data		*/
+		page_buf_t *);		/* buffer to unpin		*/
+
+extern int pagebuf_ispin( page_buf_t *); /* check if pagebuf is pinned	*/
+
+/* Reading and writing pages */
+
+extern int pagebuf_write_full_page(	/* write a page via pagebuf	*/
+		struct page *,		/* page to write		*/
+		int delalloc);		/* delalloc bh present		*/
+
+extern int pagebuf_release_page(	/* Attempt to convert a delalloc page */
+		struct page *);		/* page to release		*/
+
+extern void pagebuf_delwri_queue(page_buf_t *, int);
+extern void pagebuf_delwri_dequeue(page_buf_t *);
+
+#define PBDF_WAIT    0x01
+#define PBDF_TRYLOCK 0x02
+extern void pagebuf_delwri_flush(
+		struct pb_target *,
+		unsigned long,
+		int *);
+
+extern int pagebuf_init(void);
+extern void pagebuf_terminate(void);
+
+static __inline__ int __pagebuf_iorequest(page_buf_t *pb)
+{
+	if (pb->pb_strat)
+		return pb->pb_strat(pb);
+	return pagebuf_iorequest(pb);
+}
+
+#endif /* __PAGE_BUF_H__ */
diff --git a/fs/xfs/pagebuf/page_buf_internal.h b/fs/xfs/pagebuf/page_buf_internal.h
new file mode 100644
index 000000000000..4c0e85303b17
--- /dev/null
+++ b/fs/xfs/pagebuf/page_buf_internal.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Written by Steve Lord at SGI
+ */
+
+#ifndef __PAGE_BUF_PRIVATE_H__
+#define __PAGE_BUF_PRIVATE_H__
+
+#include "page_buf.h"
+
+#define _PAGE_BUF_INTERNAL_
+#define PB_DEFINE_TRACES
+#include "page_buf_trace.h"
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,9)
+#define page_buffers(page)	((page)->buffers)
+#define page_has_buffers(page)	((page)->buffers)
+#endif
+
+typedef struct page_buf_private_s {
+	page_buf_t		pb_common;	/* public part of structure */
+	struct semaphore	pb_sema;	/* semaphore for lockables  */
+	unsigned long		pb_flushtime;	/* time to flush pagebuf    */
+	atomic_t		pb_io_remaining;/* #outstanding I/O requests */
+	atomic_t		pb_pin_count;	/* pin count		    */
+	wait_queue_head_t	pb_waiters;	/* unpin waiters	    */
+#ifdef PAGEBUF_LOCK_TRACKING
+	int			pb_last_holder;
+#endif
+} page_buf_private_t;
+
+#define PBC(pb) (&((pb)->pb_common))
+#define PBP(pb) ((page_buf_private_t *) (pb))
+
+#ifdef PAGEBUF_LOCK_TRACKING
+#define PB_SET_OWNER(pb)	(PBP(pb)->pb_last_holder = current->pid)
+#define PB_CLEAR_OWNER(pb)	(PBP(pb)->pb_last_holder = -1)
+#define PB_GET_OWNER(pb)	(PBP(pb)->pb_last_holder)
+#else
+#define PB_SET_OWNER(pb)
+#define PB_CLEAR_OWNER(pb)
+#define PB_GET_OWNER(pb)
+#endif /* PAGEBUF_LOCK_TRACKING */
+
+/* Tracing utilities for pagebuf */
+typedef struct {
+	int			event;
+	unsigned long		pb;
+	page_buf_flags_t	flags;
+	unsigned short		hold;
+	unsigned short		lock_value;
+	void			*task;
+	void			*misc;
+	void			*ra;
+	loff_t			offset;
+	size_t			size;
+} pagebuf_trace_t;
+
+struct pagebuf_trace_buf {
+	pagebuf_trace_t		*buf;
+	volatile int		start;
+	volatile int		end;
+};
+
+#define PB_TRACE_BUFSIZE	1024
+#define CIRC_INC(i)	(((i) + 1) & (PB_TRACE_BUFSIZE - 1))
+
+typedef struct pagebuf_daemon {
+	int			active;
+	int			io_active;
+	spinlock_t		pb_delwrite_lock;
+	struct list_head	pb_delwrite_l;
+	int			pb_delwri_cnt;
+} pagebuf_daemon_t;
+
+/*
+ * Tunable pagebuf parameters
+ */
+
+#define P_PARAM 4
+
+typedef union pagebuf_param {
+	struct {
+		ulong	flush_interval; /* interval between runs of the
+					 * delwri flush daemon.	 */
+		ulong	age_buffer;	/* time for buffer to age before
+					 * we flush it.	 */
+		ulong	debug;		/* debug tracing on or off */
+		ulong	stats_clear;	/* clear the pagebuf stats */
+	} p_un;
+	ulong data[P_PARAM];
+} pagebuf_param_t;
+
+enum {
+	PB_FLUSH_INT = 1,
+	PB_FLUSH_AGE = 2,
+	PB_STATS_CLEAR = 3,
+	PB_DEBUG = 4
+};
+
+extern pagebuf_param_t	pb_params;
+
+/*
+ * Pagebuf statistics
+ */
+
+struct pbstats {
+	u_int32_t	pb_get;
+	u_int32_t	pb_create;
+	u_int32_t	pb_get_locked;
+	u_int32_t	pb_get_locked_waited;
+	u_int32_t	pb_busy_locked;
+	u_int32_t	pb_miss_locked;
+	u_int32_t	pb_page_retries;
+	u_int32_t	pb_page_found;
+	u_int32_t	pb_get_read;
+};
+
+extern struct pbstats pbstats;
+
+#define PB_STATS_INC(count)	( count ++ )
+
+#undef assert
+#ifdef PAGEBUF_DEBUG
+# define assert(expr) \
+	if (!(expr)) {						\
+		printk("Assertion failed: %s\n%s::%s line %d\n",\
+		#expr,__FILE__,__FUNCTION__,__LINE__);		\
+		BUG();						\
+	}
+#else
+# define assert(x)	do { } while (0)
+#endif
+
+#ifndef STATIC
+# define STATIC static
+#endif
+
+#endif /* __PAGE_BUF_PRIVATE_H__ */
diff --git a/fs/xfs/pagebuf/page_buf_locking.c b/fs/xfs/pagebuf/page_buf_locking.c
new file mode 100644
index 000000000000..0fdd04fc7e8b
--- /dev/null
+++ b/fs/xfs/pagebuf/page_buf_locking.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Portions Copyright (c) 2002 Christoph Hellwig.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ *	page_buf_locking.c
+ *
+ *	The page_buf module provides an abstract buffer cache model on top of
+ *	the Linux page cache.  Cached blocks for a file are hashed to the
+ *	inode for that file, and can be held dirty in delayed write mode in
+ *	the page cache.	 Cached metadata blocks for a file system are hashed
+ *	to the inode for the mounted device.  The page_buf module assembles
+ *	buffer (page_buf_t) objects on demand to aggregate such cached pages
+ *	for I/O.  The page_buf_locking module adds support for locking such
+ *	page buffers.
+ *
+ *	Written by Steve Lord at SGI
+ *
+ */
+
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/major.h>
+
+#include "page_buf_internal.h"
+
+#ifndef EVMS_MAJOR
+#define EVMS_MAJOR      117
+#endif
+
+/*
+ *	pagebuf_cond_lock
+ *
+ *	pagebuf_cond_lock locks a buffer object, if it is not already locked.
+ *	Note that this in no way
+ *	locks the underlying pages, so it is only useful for synchronizing
+ *	concurrent use of page buffer objects, not for synchronizing independent
+ *	access to the underlying pages.
+ */
+int
+pagebuf_cond_lock(			/* lock buffer, if not locked	*/
+					/* returns -EBUSY if locked)	*/
+	page_buf_t		*pb)
+{
+	int			locked;
+
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+
+	locked = down_trylock(&PBP(pb)->pb_sema) == 0;
+	if (locked) {
+		PB_SET_OWNER(pb);
+	}
+
+	PB_TRACE(pb, PB_TRACE_REC(condlck), locked);
+
+	return(locked ? 0 : -EBUSY);
+}
+
+/*
+ *	pagebuf_lock_value
+ *
+ *	Return lock value for a pagebuf
+ */
+int
+pagebuf_lock_value(
+	page_buf_t		*pb)
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+	return(atomic_read(&PBP(pb)->pb_sema.count));
+}
+
+/*
+ *	pagebuf_lock
+ *
+ *	pagebuf_lock locks a buffer object.  Note that this in no way
+ *	locks the underlying pages, so it is only useful for synchronizing
+ *	concurrent use of page buffer objects, not for synchronizing independent
+ *	access to the underlying pages.
+ */
+int
+pagebuf_lock(
+	page_buf_t		*pb)
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+
+	PB_TRACE(pb, PB_TRACE_REC(lock), 0);
+	if (atomic_read(&PBP(pb)->pb_io_remaining))
+		blk_run_queues();
+	down(&PBP(pb)->pb_sema);
+	PB_SET_OWNER(pb);
+	PB_TRACE(pb, PB_TRACE_REC(locked), 0);
+	return 0;
+}
+
+/*
+ *	pagebuf_lock_disable
+ *
+ *	pagebuf_lock_disable disables buffer object locking for an inode.
+ *	remove_super() does a blkdev_put for us on the data device, hence
+ * 	the do_blkdev_put argument.
+ */
+void
+pagebuf_lock_disable(
+	pb_target_t		*target,
+	int			do_blkdev_put)
+{
+	pagebuf_delwri_flush(target, PBDF_WAIT, NULL);
+	if (do_blkdev_put)
+		blkdev_put(target->pbr_bdev, BDEV_FS);
+	kfree(target);
+}
+
+/*
+ *	pagebuf_lock_enable
+ *
+ *	get_sb_bdev() does a blkdev_get for us on the data device, hence
+ *	the do_blkdev_get argument.
+ */
+pb_target_t *
+pagebuf_lock_enable(
+	dev_t			dev,
+	int			do_blkdev_get)
+{
+	struct block_device	*bdev;
+	pb_target_t		*target;
+	int			error = -ENOMEM;
+
+	target = kmalloc(sizeof(pb_target_t), GFP_KERNEL);
+	if (unlikely(!target))
+		return ERR_PTR(error);
+
+	bdev = bdget(dev);
+	if (unlikely(!bdev))
+		goto fail;
+
+	if (do_blkdev_get) {
+		error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS);
+		if (unlikely(error))
+			goto fail;
+	}
+
+	target->pbr_dev = dev;
+	target->pbr_bdev = bdev;
+	target->pbr_mapping = bdev->bd_inode->i_mapping;
+
+	pagebuf_target_blocksize(target, PAGE_CACHE_SIZE);
+	
+	if ((MAJOR(dev) == MD_MAJOR) || (MAJOR(dev) == EVMS_MAJOR))
+		target->pbr_flags = PBR_ALIGNED_ONLY;
+	else if (MAJOR(dev) == LVM_BLK_MAJOR)
+		target->pbr_flags = PBR_SECTOR_ONLY;
+	else
+		target->pbr_flags = 0;
+
+	return target;
+
+fail:
+	kfree(target);
+	return ERR_PTR(error);
+}
+
+void
+pagebuf_target_blocksize(
+	pb_target_t		*target,
+	unsigned int		blocksize)
+{
+	target->pbr_blocksize = blocksize;
+	target->pbr_blocksize_bits = ffs(blocksize) - 1;
+}
+
+void
+pagebuf_target_clear(
+	pb_target_t		*target)
+{
+	invalidate_bdev(target->pbr_bdev, 1);
+	truncate_inode_pages(target->pbr_mapping, 0LL);
+}
+
+/*
+ *	pagebuf_unlock
+ *
+ *	pagebuf_unlock releases the lock on the buffer object created by
+ *	pagebuf_lock or pagebuf_cond_lock (not any
+ *	pinning of underlying pages created by pagebuf_pin).
+ */
+void
+pagebuf_unlock(				/* unlock buffer		*/
+	page_buf_t		*pb)	/* buffer to unlock		*/
+{
+	assert(pb->pb_flags & _PBF_LOCKABLE);
+	PB_CLEAR_OWNER(pb);
+	up(&PBP(pb)->pb_sema);
+	PB_TRACE(pb, PB_TRACE_REC(unlock), 0);
+}
diff --git a/fs/xfs/pagebuf/page_buf_trace.h b/fs/xfs/pagebuf/page_buf_trace.h
new file mode 100644
index 000000000000..51caa533f269
--- /dev/null
+++ b/fs/xfs/pagebuf/page_buf_trace.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __PAGEBUF_TRACE__
+#define __PAGEBUF_TRACE__
+
+#ifdef PB_DEFINE_TRACES
+#define PB_TRACE_START	typedef enum {
+#define PB_TRACE_REC(x) pb_trace_point_##x
+#define PB_TRACE_END	} pb_trace_var_t;
+#else
+#define PB_TRACE_START	static char	*event_names[] = {
+#define PB_TRACE_REC(x) #x
+#define PB_TRACE_END	};
+#endif
+
+PB_TRACE_START
+PB_TRACE_REC(get),
+PB_TRACE_REC(get_obj),
+PB_TRACE_REC(free_obj),
+PB_TRACE_REC(look_pg),
+PB_TRACE_REC(get_read),
+PB_TRACE_REC(no_daddr),
+PB_TRACE_REC(hold),
+PB_TRACE_REC(rele),
+PB_TRACE_REC(done),
+PB_TRACE_REC(ioerror),
+PB_TRACE_REC(iostart),
+PB_TRACE_REC(end_io),
+PB_TRACE_REC(do_io),
+PB_TRACE_REC(ioreq),
+PB_TRACE_REC(iowait),
+PB_TRACE_REC(iowaited),
+PB_TRACE_REC(free_lk),
+PB_TRACE_REC(freed_l),
+PB_TRACE_REC(cmp),
+PB_TRACE_REC(get_lk),
+PB_TRACE_REC(got_lk),
+PB_TRACE_REC(skip),
+PB_TRACE_REC(lock),
+PB_TRACE_REC(locked),
+PB_TRACE_REC(unlock),
+PB_TRACE_REC(avl_ret),
+PB_TRACE_REC(condlck),
+PB_TRACE_REC(avl_ins),
+PB_TRACE_REC(walkq1),
+PB_TRACE_REC(walkq2),
+PB_TRACE_REC(walkq3),
+PB_TRACE_REC(delwri_q),
+PB_TRACE_REC(delwri_uq),
+PB_TRACE_REC(pin),
+PB_TRACE_REC(unpin),
+PB_TRACE_REC(file_write),
+PB_TRACE_REC(external),
+PB_TRACE_END
+
+extern void pb_trace_func(page_buf_t *, int, void *, void *);
+#ifdef PAGEBUF_TRACE
+# define PB_TRACE(pb, event, misc)		\
+	pb_trace_func(pb, event, (void *) misc, \
+			(void *)__builtin_return_address(0))
+#else
+# define PB_TRACE(pb, event, misc)	do { } while (0)
+#endif
+
+#endif	/* __PAGEBUF_TRACE__ */
diff --git a/fs/xfs/support/atomic.h b/fs/xfs/support/atomic.h
new file mode 100644
index 000000000000..1cf5eecbf4fe
--- /dev/null
+++ b/fs/xfs/support/atomic.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_ATOMIC_H__
+#define __XFS_SUPPORT_ATOMIC_H__
+
+#include <linux/version.h>
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * This is used for two variables in XFS, one of which is a debug trace
+ * buffer index. They are not accessed via any other atomic operations
+ * so this is safe. All other atomic increments and decrements in XFS
+ * now use the linux built in functions.
+ */
+
+extern spinlock_t Atomic_spin;
+
+static __inline__ int atomicIncWithWrap(int *ip, int val)
+{
+	unsigned long flags;
+	int ret;
+	spin_lock_irqsave(&Atomic_spin, flags);
+	ret = *ip;
+	(*ip)++;
+	if (*ip == val) *ip = 0;
+	spin_unlock_irqrestore(&Atomic_spin, flags);
+	return ret;
+}
+
+#endif /* __XFS_SUPPORT_ATOMIC_H__ */
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
new file mode 100644
index 000000000000..230d2e74d88d
--- /dev/null
+++ b/fs/xfs/support/debug.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "debug.h"
+
+#include <asm/page.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+
+int			doass = 1;
+static char		message[256];	/* keep it off the stack */
+static spinlock_t 	xfs_err_lock = SPIN_LOCK_UNLOCKED;
+
+void
+assfail(char *a, char *f, int l)
+{
+    printk("XFS assertion failed: %s, file: %s, line: %d\n", a, f, l);
+    BUG();
+}
+
+#ifdef DEBUG
+
+unsigned long
+random(void)
+{
+	static unsigned long	RandomValue = 1;
+	/* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */
+	register long	rv = RandomValue;
+	register long	lo;
+	register long	hi;
+
+	hi = rv / 127773;
+	lo = rv % 127773;
+	rv = 16807 * lo - 2836 * hi;
+	if( rv <= 0 ) rv += 2147483647;
+	return( RandomValue = rv );
+}
+
+int
+get_thread_id(void)
+{
+	return current->pid;
+}
+
+# define xdprintk(format...)	printk(format)
+#else
+# define xdprintk(format...)	do { } while (0)
+#endif
+
+void
+cmn_err(register int level, char *fmt, ...)
+{
+	char	*fp = fmt;
+	va_list ap;
+
+	spin_lock(&xfs_err_lock);
+	va_start(ap, fmt);
+	if (*fmt == '!') fp++;
+	vsprintf(message, fp, ap);
+	switch (level) {
+	case CE_CONT:
+		printk("%s", message);
+		break;
+	case CE_DEBUG:
+		xdprintk("%s", message);
+		break;
+	default:
+		printk("%s\n", message);
+		break;
+	}
+	va_end(ap);
+	spin_unlock(&xfs_err_lock);
+
+	if (level == CE_PANIC)
+		BUG();
+}
+
+
+void
+icmn_err(register int level, char *fmt, va_list ap)
+{
+	spin_lock(&xfs_err_lock);
+	vsprintf(message, fmt, ap);
+	switch (level) {
+	case CE_CONT:
+		printk("%s", message);
+		break;
+	case CE_DEBUG:
+		xdprintk("%s", message);
+		break;
+	default:
+		printk("cmn_err level %d ", level);
+		printk("%s\n", message);
+		break;
+	}
+	spin_unlock(&xfs_err_lock);
+}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
new file mode 100644
index 000000000000..0547de8a472b
--- /dev/null
+++ b/fs/xfs/support/debug.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_DEBUG_H__
+#define __XFS_SUPPORT_DEBUG_H__
+
+#include <stdarg.h>
+
+#define CE_DEBUG	7		/* debug	*/
+#define CE_CONT		6		/* continuation */
+#define CE_NOTE		5		/* notice	*/
+#define CE_WARN		4		/* warning	*/
+#define CE_ALERT	1		/* alert	*/
+#define CE_PANIC	0		/* panic	*/
+
+extern void icmn_err(int, char *, va_list);
+extern void cmn_err(int, char *, ...);
+
+#ifdef DEBUG
+# ifdef lint
+#  define ASSERT(EX)	((void)0) /* avoid "constant in conditional" babble */
+# else
+#  define ASSERT(EX) ((!doass||(EX))?((void)0):assfail(#EX, __FILE__, __LINE__))
+# endif /* lint */
+#else
+# define ASSERT(x)	((void)0)
+#endif
+
+extern int doass;		/* dynamically turn off asserts */
+extern void assfail(char *, char *, int);
+#ifdef DEBUG
+extern unsigned long random(void);
+extern int get_thread_id(void);
+#endif
+
+#define ASSERT_ALWAYS(EX)  ((EX)?((void)0):assfail(#EX, __FILE__, __LINE__))
+#define debug_stop_all_cpus(param)	/* param is "cpumask_t *" */
+
+#endif	/* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/support/kmem.c b/fs/xfs/support/kmem.c
new file mode 100644
index 000000000000..30dcef4318d7
--- /dev/null
+++ b/fs/xfs/support/kmem.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include "time.h"
+#include "kmem.h"
+
+#define DEF_PRIORITY	(6)
+#define MAX_SLAB_SIZE	0x10000
+
+static __inline unsigned int flag_convert(int flags)
+{
+#if DEBUG
+	if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS))) {
+		printk(KERN_WARNING
+		    "XFS: memory allocation with wrong flags (%x)\n", flags);
+		BUG();
+	}
+#endif
+
+	if (flags & KM_NOSLEEP)
+		return GFP_ATOMIC;
+	/* If we're in a transaction, FS activity is not ok */
+	else if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		return GFP_NOFS;
+	else
+		return GFP_KERNEL;
+}
+
+#define MAX_SHAKE 8
+
+static kmem_shake_func_t	shake_list[MAX_SHAKE];
+static DECLARE_MUTEX(shake_sem);
+
+void	kmem_shake_register(kmem_shake_func_t sfunc)
+{
+	int	i;
+
+	down(&shake_sem);
+	for (i = 0; i < MAX_SHAKE; i++) {
+		if (shake_list[i] == NULL) {
+			shake_list[i] = sfunc;
+			break;
+		}
+	}
+	if (i == MAX_SHAKE)
+		BUG();
+	up(&shake_sem);
+}
+
+void	kmem_shake_deregister(kmem_shake_func_t sfunc)
+{
+	int	i;
+
+	down(&shake_sem);
+	for (i = 0; i < MAX_SHAKE; i++) {
+		if (shake_list[i] == sfunc)
+			break;
+	}
+	if (i == MAX_SHAKE)
+		BUG();
+	for (; i < MAX_SHAKE - 1; i++) {
+		shake_list[i] = shake_list[i+1];
+	}
+	shake_list[i] = NULL;
+	up(&shake_sem);
+}
+
+static __inline__ void kmem_shake(void)
+{
+	int	i;
+
+	down(&shake_sem);
+	for (i = 0; i < MAX_SHAKE && shake_list[i]; i++)
+		(*shake_list[i])();
+	up(&shake_sem);
+	delay(10);
+}
+
+void *
+kmem_alloc(size_t size, int flags)
+{
+	int	shrink = DEF_PRIORITY;	/* # times to try to shrink cache */
+	void	*rval;
+
+repeat:
+	if (MAX_SLAB_SIZE < size) {
+		/* Avoid doing filesystem sensitive stuff to get this */
+		rval = __vmalloc(size, flag_convert(flags), PAGE_KERNEL);
+	} else {
+		rval = kmalloc(size, flag_convert(flags));
+	}
+
+	if (rval || (flags & KM_NOSLEEP))
+		return rval;
+
+	/*
+	 * KM_SLEEP callers don't expect a failure
+	 */
+	if (shrink) {
+		kmem_shake();
+
+		shrink--;
+		goto repeat;
+	}
+
+	rval = __vmalloc(size, flag_convert(flags), PAGE_KERNEL);
+	if (!rval && !(flags & KM_SLEEP))
+		panic("kmem_alloc: NULL memory on KM_SLEEP request!");
+
+	return rval;
+}
+
+void *
+kmem_zalloc(size_t size, int flags)
+{
+	void	*ptr;
+
+	ptr = kmem_alloc(size, flags);
+
+	if (ptr)
+		memset((char *)ptr, 0, (int)size);
+
+	return (ptr);
+}
+
+void
+kmem_free(void *ptr, size_t size)
+{
+	if (((unsigned long)ptr < VMALLOC_START) ||
+            ((unsigned long)ptr >= VMALLOC_END)) {
+		kfree(ptr);
+	} else {
+		vfree(ptr);
+	}
+}
+
+void *
+kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
+{
+	void *new;
+
+	new = kmem_alloc(newsize, flags);
+	if (ptr) {
+		memcpy(new, ptr, ((oldsize < newsize) ? oldsize : newsize));
+		kmem_free(ptr, oldsize);
+	}
+
+	return new;
+}
+
+kmem_zone_t *
+kmem_zone_init(int size, char *zone_name)
+{
+	return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
+}
+
+void *
+kmem_zone_alloc(kmem_zone_t *zone, int flags)
+{
+	int	shrink = DEF_PRIORITY;	/* # times to try to shrink cache */
+	void	*ptr = NULL;
+
+repeat:
+	ptr = kmem_cache_alloc(zone, flag_convert(flags));
+
+	if (ptr || (flags & KM_NOSLEEP))
+		return ptr;
+
+	/*
+	 * KM_SLEEP callers don't expect a failure
+	 */
+	if (shrink) {
+		kmem_shake();
+
+		shrink--;
+		goto repeat;
+	}
+
+	if (flags & KM_SLEEP)
+		panic("kmem_zone_alloc: NULL memory on KM_SLEEP request!");
+
+	return NULL;
+}
+
+void *
+kmem_zone_zalloc(kmem_zone_t *zone, int flags)
+{
+	int	shrink = DEF_PRIORITY;	/* # times to try to shrink cache */
+	void	*ptr = NULL;
+
+repeat:
+	ptr = kmem_cache_alloc(zone, flag_convert(flags));
+
+	if (ptr) {
+		memset(ptr, 0, kmem_cache_size(zone));
+		return ptr;
+	}
+
+	if (flags & KM_NOSLEEP)
+		return ptr;
+
+	/*
+	 * KM_SLEEP callers don't expect a failure
+	 */
+	if (shrink) {
+		kmem_shake();
+
+		shrink--;
+		goto repeat;
+	}
+
+	if (flags & KM_SLEEP)
+		panic("kmem_zone_zalloc: NULL memory on KM_SLEEP request!");
+
+	return NULL;
+}
+
+void
+kmem_zone_free(kmem_zone_t *zone, void *ptr)
+{
+	kmem_cache_free(zone, ptr);
+}
diff --git a/fs/xfs/support/kmem.h b/fs/xfs/support/kmem.h
new file mode 100644
index 000000000000..000aa10ee16f
--- /dev/null
+++ b/fs/xfs/support/kmem.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/slab.h>
+
+/*
+ * memory management routines
+ */
+#define KM_SLEEP	0x0001
+#define KM_NOSLEEP	0x0002
+#define KM_NOFS		0x0004
+
+#define	kmem_zone	kmem_cache_s
+#define kmem_zone_t	kmem_cache_t
+
+extern kmem_zone_t  *kmem_zone_init(int, char *);
+extern void	    *kmem_zone_zalloc(kmem_zone_t *, int);
+extern void	    *kmem_zone_alloc(kmem_zone_t *, int);
+extern void	    kmem_zone_free(kmem_zone_t *, void *);
+
+extern void	    *kmem_alloc(size_t, int);
+extern void	    *kmem_realloc(void *, size_t, size_t, int);
+extern void	    *kmem_zalloc(size_t, int);
+extern void	    kmem_free(void *, size_t);
+
+typedef void	    (*kmem_shake_func_t)(void);
+
+extern void	    kmem_shake_register(kmem_shake_func_t);
+extern void	    kmem_shake_deregister(kmem_shake_func_t);
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
new file mode 100644
index 000000000000..99743764a503
--- /dev/null
+++ b/fs/xfs/support/ktrace.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <xfs_types.h>
+#include "kmem.h"
+#include "spin.h"
+#include "debug.h"
+#include "atomic.h"
+#include "ktrace.h"
+
+#if	(defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING))
+
+static kmem_zone_t *ktrace_hdr_zone;
+static kmem_zone_t *ktrace_ent_zone;
+static int	    ktrace_zentries;
+
+void
+ktrace_init(int zentries)
+{
+	ktrace_zentries = zentries;
+
+	ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
+					"ktrace_hdr");
+	ASSERT(ktrace_hdr_zone);
+
+	ktrace_ent_zone = kmem_zone_init(ktrace_zentries
+					* sizeof(ktrace_entry_t),
+					"ktrace_ent");
+	ASSERT(ktrace_ent_zone);
+}
+
+void
+ktrace_uninit(void)
+{
+	kmem_cache_destroy(ktrace_hdr_zone);
+	kmem_cache_destroy(ktrace_ent_zone);
+}
+
+/*
+ * ktrace_alloc()
+ *
+ * Allocate a ktrace header and enough buffering for the given
+ * number of entries.
+ */
+ktrace_t *
+ktrace_alloc(int nentries, int sleep)
+{
+	ktrace_t	*ktp;
+	ktrace_entry_t	*ktep;
+
+	ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
+
+	if (ktp == (ktrace_t*)NULL) {
+		/*
+		 * KM_SLEEP callers don't expect failure.
+		 */
+		if (sleep & KM_SLEEP)
+			panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
+
+		return NULL;
+	}
+
+	/*
+	 * Special treatment for buffers with the ktrace_zentries entries
+	 */
+	if (nentries == ktrace_zentries) {
+		ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
+							    sleep);
+	} else {
+		ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
+							    sleep);
+	}
+
+	if (ktep == NULL) {
+		/*
+		 * KM_SLEEP callers don't expect failure.
+		 */
+		if (sleep & KM_SLEEP)
+			panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
+
+		kmem_free(ktp, sizeof(*ktp));
+
+		return NULL;
+	}
+
+	spinlock_init(&(ktp->kt_lock), "kt_lock");
+
+	ktp->kt_entries	 = ktep;
+	ktp->kt_nentries = nentries;
+	ktp->kt_index	 = 0;
+	ktp->kt_rollover = 0;
+
+	return ktp;
+}
+
+
+/*
+ * ktrace_free()
+ *
+ * Free up the ktrace header and buffer.  It is up to the caller
+ * to ensure that no-one is referencing it.
+ */
+void
+ktrace_free(ktrace_t *ktp)
+{
+	int	entries_size;
+
+	if (ktp == (ktrace_t *)NULL)
+		return;
+
+	spinlock_destroy(&ktp->kt_lock);
+
+	/*
+	 * Special treatment for the Vnode trace buffer.
+	 */
+	if (ktp->kt_nentries == ktrace_zentries) {
+		kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
+	} else {
+		entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
+
+		kmem_free(ktp->kt_entries, entries_size);
+	}
+
+	kmem_zone_free(ktrace_hdr_zone, ktp);
+}
+
+
+/*
+ * Enter the given values into the "next" entry in the trace buffer.
+ * kt_index is always the index of the next entry to be filled.
+ */
+void
+ktrace_enter(
+	ktrace_t	*ktp,
+	void		*val0,
+	void		*val1,
+	void		*val2,
+	void		*val3,
+	void		*val4,
+	void		*val5,
+	void		*val6,
+	void		*val7,
+	void		*val8,
+	void		*val9,
+	void		*val10,
+	void		*val11,
+	void		*val12,
+	void		*val13,
+	void		*val14,
+	void		*val15)
+{
+	int		index;
+	ktrace_entry_t	*ktep;
+
+	ASSERT(ktp != NULL);
+
+	/*
+	 * Grab an entry by pushing the index up to the next one.
+	 */
+	index = atomicIncWithWrap(&ktp->kt_index, ktp->kt_nentries);
+
+	if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
+		ktp->kt_rollover = 1;
+
+	ASSERT((index >= 0) && (index < ktp->kt_nentries));
+
+	ktep = &(ktp->kt_entries[index]);
+
+	ktep->val[0]  = val0;
+	ktep->val[1]  = val1;
+	ktep->val[2]  = val2;
+	ktep->val[3]  = val3;
+	ktep->val[4]  = val4;
+	ktep->val[5]  = val5;
+	ktep->val[6]  = val6;
+	ktep->val[7]  = val7;
+	ktep->val[8]  = val8;
+	ktep->val[9]  = val9;
+	ktep->val[10] = val10;
+	ktep->val[11] = val11;
+	ktep->val[12] = val12;
+	ktep->val[13] = val13;
+	ktep->val[14] = val14;
+	ktep->val[15] = val15;
+}
+
+/*
+ * Return the number of entries in the trace buffer.
+ */
+int
+ktrace_nentries(
+	ktrace_t	*ktp)
+{
+	if (ktp == NULL) {
+		return 0;
+	}
+
+	return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index);
+}
+
+
+/*
+ * ktrace_first()
+ *
+ * This is used to find the start of the trace buffer.
+ * In conjunction with ktrace_next() it can be used to
+ * iterate through the entire trace buffer.  This code does
+ * not do any locking because it is assumed that it is called
+ * from the debugger.
+ *
+ * The caller must pass in a pointer to a ktrace_snap
+ * structure in which we will keep some state used to
+ * iterate through the buffer.	This state must not touched
+ * by any code outside of this module.
+ */
+ktrace_entry_t *
+ktrace_first(ktrace_t	*ktp, ktrace_snap_t	*ktsp)
+{
+	ktrace_entry_t	*ktep;
+	int		index;
+	int		nentries;
+
+	if (ktp->kt_rollover)
+		index = ktp->kt_index;
+	else
+		index = 0;
+
+	ktsp->ks_start = index;
+	ktep = &(ktp->kt_entries[index]);
+
+	nentries = ktrace_nentries(ktp);
+	index++;
+	if (index < nentries) {
+		ktsp->ks_index = index;
+	} else {
+		ktsp->ks_index = 0;
+		if (index > nentries)
+			ktep = NULL;
+	}
+	return ktep;
+}
+
+
+/*
+ * ktrace_next()
+ *
+ * This is used to iterate through the entries of the given
+ * trace buffer.  The caller must pass in the ktrace_snap_t
+ * structure initialized by ktrace_first().  The return value
+ * will be either a pointer to the next ktrace_entry or NULL
+ * if all of the entries have been traversed.
+ */
+ktrace_entry_t *
+ktrace_next(
+	ktrace_t	*ktp,
+	ktrace_snap_t	*ktsp)
+{
+	int		index;
+	ktrace_entry_t	*ktep;
+
+	index = ktsp->ks_index;
+	if (index == ktsp->ks_start) {
+		ktep = NULL;
+	} else {
+		ktep = &ktp->kt_entries[index];
+	}
+
+	index++;
+	if (index == ktrace_nentries(ktp)) {
+		ktsp->ks_index = 0;
+	} else {
+		ktsp->ks_index = index;
+	}
+
+	return ktep;
+}
+
+#if	(defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING))
+EXPORT_SYMBOL(ktrace_first);
+EXPORT_SYMBOL(ktrace_next);
+#endif
+
+/*
+ * ktrace_skip()
+ *
+ * Skip the next "count" entries and return the entry after that.
+ * Return NULL if this causes us to iterate past the beginning again.
+ */
+
+ktrace_entry_t *
+ktrace_skip(
+	ktrace_t	*ktp,
+	int		count,
+	ktrace_snap_t	*ktsp)
+{
+	int		index;
+	int		new_index;
+	ktrace_entry_t	*ktep;
+	int		nentries = ktrace_nentries(ktp);
+
+	index = ktsp->ks_index;
+	new_index = index + count;
+	while (new_index >= nentries) {
+		new_index -= nentries;
+	}
+	if (index == ktsp->ks_start) {
+		/*
+		 * We've iterated around to the start, so we're done.
+		 */
+		ktep = NULL;
+	} else if ((new_index < index) && (index < ktsp->ks_index)) {
+		/*
+		 * We've skipped past the start again, so we're done.
+		 */
+		ktep = NULL;
+		ktsp->ks_index = ktsp->ks_start;
+	} else {
+		ktep = &(ktp->kt_entries[new_index]);
+		new_index++;
+		if (new_index == nentries) {
+			ktsp->ks_index = 0;
+		} else {
+			ktsp->ks_index = new_index;
+		}
+	}
+	return ktep;
+}
+
+#else
+
+ktrace_t *
+ktrace_alloc(int nentries, int sleep)
+{
+	/*
+	 * KM_SLEEP callers don't expect failure.
+	 */
+	if (sleep & KM_SLEEP)
+		panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
+
+	return NULL;
+}
+#endif
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
new file mode 100644
index 000000000000..f1ece4a47971
--- /dev/null
+++ b/fs/xfs/support/ktrace.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_KTRACE_H__
+#define __XFS_SUPPORT_KTRACE_H__
+
+
+/*
+ * Trace buffer entry structure.
+ */
+typedef struct ktrace_entry {
+	void	*val[16];
+} ktrace_entry_t;
+
+/*
+ * Trace buffer header structure.
+ */
+typedef struct ktrace {
+	lock_t		kt_lock;	/* mutex to guard counters */
+	int		kt_nentries;	/* number of entries in trace buf */
+	int		kt_index;	/* current index in entries */
+	int		kt_rollover;
+	ktrace_entry_t	*kt_entries;	/* buffer of entries */
+} ktrace_t;
+
+/*
+ * Trace buffer snapshot structure.
+ */
+typedef struct ktrace_snap {
+	int		ks_start;	/* kt_index at time of snap */
+	int		ks_index;	/* current index */
+} ktrace_snap_t;
+
+/*
+ * Exported interfaces.
+ */
+extern ktrace_t *ktrace_alloc(int, int);
+
+#if	(defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING))
+
+extern void ktrace_init(int zentries);
+extern void ktrace_uninit(void);
+
+extern void ktrace_free(ktrace_t *);
+
+extern void ktrace_enter(
+	ktrace_t	*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*,
+	void		*);
+
+extern ktrace_entry_t	*ktrace_first(ktrace_t *, ktrace_snap_t *);
+extern int		ktrace_nentries(ktrace_t *);
+extern ktrace_entry_t	*ktrace_next(ktrace_t *, ktrace_snap_t *);
+extern ktrace_entry_t	*ktrace_skip(ktrace_t *, int, ktrace_snap_t *);
+
+#else
+
+#define ktrace_free(ktp)
+#define ktrace_enter(ktp,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15)
+
+#endif
+
+#endif /* __XFS_SUPPORT_KTRACE_H__ */
diff --git a/fs/xfs/support/move.c b/fs/xfs/support/move.c
new file mode 100644
index 000000000000..4fc3831eed38
--- /dev/null
+++ b/fs/xfs/support/move.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <asm/uaccess.h>
+
+#include <xfs_types.h>
+#include "debug.h"
+#include "move.h"
+
+/*
+ * Move "n" bytes at byte address "cp"; "rw" indicates the direction
+ * of the move, and the I/O parameters are provided in "uio", which is
+ * update to reflect the data which was moved.	Returns 0 on success or
+ * a non-zero errno on failure.
+ */
+int
+uiomove(void *cp, size_t n, enum uio_rw rw, struct uio *uio)
+{
+	register struct iovec *iov;
+	u_int cnt;
+	int error;
+
+	while (n > 0 && uio->uio_resid) {
+		iov = uio->uio_iov;
+		cnt = (u_int)iov->iov_len;
+		if (cnt == 0) {
+			uio->uio_iov++;
+			uio->uio_iovcnt--;
+			continue;
+		}
+		if (cnt > n)
+			cnt = (u_int)n;
+		switch (uio->uio_segflg) {
+		case UIO_USERSPACE:
+			if (rw == UIO_READ)
+				error = copy_to_user(iov->iov_base, cp, cnt);
+			else
+				error = copy_from_user(cp, iov->iov_base, cnt);
+			if (error)
+				return EFAULT;
+			break;
+
+
+		case UIO_SYSSPACE:
+			if (rw == UIO_READ)
+				bcopy(cp, iov->iov_base, cnt);
+			else
+				bcopy(iov->iov_base, cp, cnt);
+			break;
+
+		default:
+			ASSERT(0);
+			break;
+		}
+		iov->iov_base = (void *)((char *)iov->iov_base + cnt);
+		iov->iov_len -= cnt;
+		uio->uio_resid -= cnt;
+		uio->uio_offset += cnt;
+		cp = (void *)((char *)cp + cnt);
+		n -= cnt;
+	}
+	return 0;
+}
diff --git a/fs/xfs/support/move.h b/fs/xfs/support/move.h
new file mode 100644
index 000000000000..e01b7b6c7a15
--- /dev/null
+++ b/fs/xfs/support/move.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef __XFS_SUPPORT_MOVE_H__
+#define __XFS_SUPPORT_MOVE_H__
+
+#include <linux/uio.h>
+#include <asm/uaccess.h>
+
+#define bzero(p,s)	memset((p), 0, (s))
+#define bcopy(s,d,n)	memcpy((d),(s),(n))
+#define bcmp(s1,s2,l)	memcmp(s1,s2,l)
+#define ovbcopy(from,to,count)	memmove(to,from,count)
+
+typedef struct iovec iovec_t;
+
+typedef struct uio {
+	iovec_t		*uio_iov;	/* pointer to array of iovecs */
+	int		uio_iovcnt;	/* number of iovecs */
+	int		uio_fmode;	/* file mode flags */
+	xfs_off_t	uio_offset;	/* file offset */
+	short		uio_segflg;	/* address space (kernel or user) */
+	ssize_t		uio_resid;	/* residual count */
+} uio_t;
+
+/*
+ * I/O direction.
+ */
+typedef enum uio_rw { UIO_READ, UIO_WRITE } uio_rw_t;
+
+/*
+ * Segment flag values.
+ */
+typedef enum uio_seg {
+	UIO_USERSPACE,		/* uio_iov describes user space */
+	UIO_SYSSPACE,		/* uio_iov describes system space */
+} uio_seg_t;
+
+
+extern int	uiomove (void *, size_t, uio_rw_t, uio_t *);
+
+#endif	/* __XFS_SUPPORT_MOVE_H__ */
diff --git a/fs/xfs/support/mrlock.c b/fs/xfs/support/mrlock.c
new file mode 100644
index 000000000000..cb387f171943
--- /dev/null
+++ b/fs/xfs/support/mrlock.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/time.h>
+#include <linux/sched.h>
+#include <asm/system.h>
+#include <linux/interrupt.h>
+#include <asm/current.h>
+
+#include "mrlock.h"
+
+
+#if USE_RW_WAIT_QUEUE_SPINLOCK
+# define wq_write_lock	write_lock
+#else
+# define wq_write_lock	spin_lock
+#endif
+
+/*
+ * We don't seem to need lock_type (only one supported), name, or
+ * sequence. But, XFS will pass it so let's leave them here for now.
+ */
+/* ARGSUSED */
+void
+mrlock_init(mrlock_t *mrp, int lock_type, char *name, long sequence)
+{
+	mrp->mr_count = 0;
+	mrp->mr_reads_waiting = 0;
+	mrp->mr_writes_waiting = 0;
+	init_waitqueue_head(&mrp->mr_readerq);
+	init_waitqueue_head(&mrp->mr_writerq);
+	mrp->mr_lock = SPIN_LOCK_UNLOCKED;
+}
+
+/*
+ * Macros to lock/unlock the mrlock_t.
+ */
+
+#define MRLOCK(m)		spin_lock(&(m)->mr_lock);
+#define MRUNLOCK(m)		spin_unlock(&(m)->mr_lock);
+
+
+/*
+ * lock_wait should never be called in an interrupt thread.
+ *
+ * mrlocks can sleep (i.e. call schedule) and so they can't ever
+ * be called from an interrupt thread.
+ *
+ * threads that wake-up should also never be invoked from interrupt threads.
+ *
+ * But, waitqueue_lock is locked from interrupt threads - and we are
+ * called with interrupts disabled, so it is all OK.
+ */
+
+/* ARGSUSED */
+void
+lock_wait(wait_queue_head_t *q, spinlock_t *lock, int rw)
+{
+	DECLARE_WAITQUEUE( wait, current );
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+
+	spin_lock(&q->lock);
+	if (rw) {
+		__add_wait_queue_tail(q, &wait);
+	} else {
+		__add_wait_queue(q, &wait);
+	}
+
+	spin_unlock(&q->lock);
+	spin_unlock(lock);
+
+	schedule();
+
+	set_current_state(TASK_RUNNING);
+
+	spin_lock(&q->lock);
+	__remove_wait_queue(q, &wait);
+	spin_unlock(&q->lock);
+
+	spin_lock(lock);
+
+	/* return with lock held */
+}
+
+/* ARGSUSED */
+void
+mrfree(mrlock_t *mrp)
+{
+}
+
+/* ARGSUSED */
+void
+mrlock(mrlock_t *mrp, int type, int flags)
+{
+	if (type == MR_ACCESS)
+		mraccess(mrp);
+	else
+		mrupdate(mrp);
+}
+
+/* ARGSUSED */
+void
+mraccessf(mrlock_t *mrp, int flags)
+{
+	MRLOCK(mrp);
+	if(mrp->mr_writes_waiting > 0) {
+		mrp->mr_reads_waiting++;
+		lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
+		mrp->mr_reads_waiting--;
+	}
+	while (mrp->mr_count < 0) {
+		mrp->mr_reads_waiting++;
+		lock_wait(&mrp->mr_readerq, &mrp->mr_lock, 0);
+		mrp->mr_reads_waiting--;
+	}
+	mrp->mr_count++;
+	MRUNLOCK(mrp);
+}
+
+/* ARGSUSED */
+void
+mrupdatef(mrlock_t *mrp, int flags)
+{
+	MRLOCK(mrp);
+	while(mrp->mr_count) {
+		mrp->mr_writes_waiting++;
+		lock_wait(&mrp->mr_writerq, &mrp->mr_lock, 1);
+		mrp->mr_writes_waiting--;
+	}
+
+	mrp->mr_count = -1; /* writer on it */
+	MRUNLOCK(mrp);
+}
+
+int
+mrtryaccess(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+	/*
+	 * If anyone is waiting for update access or the lock is held for update
+	 * fail the request.
+	 */
+	if(mrp->mr_writes_waiting > 0 || mrp->mr_count < 0) {
+		MRUNLOCK(mrp);
+		return 0;
+	}
+	mrp->mr_count++;
+	MRUNLOCK(mrp);
+	return 1;
+}
+
+int
+mrtrypromote(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+
+	if(mrp->mr_count == 1) { /* We are the only thread with the lock */
+		mrp->mr_count = -1; /* writer on it */
+		MRUNLOCK(mrp);
+		return 1;
+	}
+
+	MRUNLOCK(mrp);
+	return 0;
+}
+
+int
+mrtryupdate(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+
+	if(mrp->mr_count) {
+		MRUNLOCK(mrp);
+		return 0;
+	}
+
+	mrp->mr_count = -1; /* writer on it */
+	MRUNLOCK(mrp);
+	return 1;
+}
+
+static __inline__ void mrwake(mrlock_t *mrp)
+{
+	/*
+	 * First, if the count is now 0, we need to wake-up anyone waiting.
+	 */
+	if (!mrp->mr_count) {
+		if (mrp->mr_writes_waiting) {	/* Wake-up first writer waiting */
+			wake_up(&mrp->mr_writerq);
+		} else if (mrp->mr_reads_waiting) {	/* Wakeup any readers waiting */
+			wake_up(&mrp->mr_readerq);
+		}
+	}
+}
+
+void
+mraccunlock(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+	mrp->mr_count--;
+	mrwake(mrp);
+	MRUNLOCK(mrp);
+}
+
+void
+mrunlock(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+	if (mrp->mr_count < 0) {
+		mrp->mr_count = 0;
+	} else {
+		mrp->mr_count--;
+	}
+	mrwake(mrp);
+	MRUNLOCK(mrp);
+}
+
+int
+ismrlocked(mrlock_t *mrp, int type)	/* No need to lock since info can change */
+{
+	if (type == MR_ACCESS)
+		return (mrp->mr_count > 0); /* Read lock */
+	else if (type == MR_UPDATE)
+		return (mrp->mr_count < 0); /* Write lock */
+	else if (type == (MR_UPDATE | MR_ACCESS))
+		return (mrp->mr_count); /* Any type of lock held */
+	else /* Any waiters */
+		return (mrp->mr_reads_waiting | mrp->mr_writes_waiting);
+}
+
+/*
+ * Demote from update to access. We better be the only thread with the
+ * lock in update mode so it should be easy to set to 1.
+ * Wake-up any readers waiting.
+ */
+
+void
+mrdemote(mrlock_t *mrp)
+{
+	MRLOCK(mrp);
+	mrp->mr_count = 1;
+	if (mrp->mr_reads_waiting) {	/* Wakeup all readers waiting */
+		wake_up(&mrp->mr_readerq);
+	}
+	MRUNLOCK(mrp);
+}
+
+int
+mrislocked_access(mrlock_t *mrp)
+{
+	return(mrp->mr_count > 0);
+}
+
+int
+mrislocked_update(mrlock_t *mrp)
+{
+	return(mrp->mr_count < 0);
+}
diff --git a/fs/xfs/support/mrlock.h b/fs/xfs/support/mrlock.h
new file mode 100644
index 000000000000..7004e23b31e8
--- /dev/null
+++ b/fs/xfs/support/mrlock.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_MRLOCK_H__
+#define __XFS_SUPPORT_MRLOCK_H__
+
+#include <linux/version.h>
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * Implement mrlocks on Linux that work for XFS.
+ *
+ * These are sleep locks and not spinlocks. If one wants read/write spinlocks,
+ * use read_lock, write_lock, ... see spinlock.h.
+ */
+
+typedef struct mrlock_s {
+	int			mr_count;
+	unsigned short		mr_reads_waiting;
+	unsigned short		mr_writes_waiting;
+	wait_queue_head_t	mr_readerq;
+	wait_queue_head_t	mr_writerq;
+	spinlock_t		mr_lock;
+} mrlock_t;
+
+#define MR_ACCESS	1
+#define MR_UPDATE	2
+
+#define MRLOCK_BARRIER		0x1
+#define MRLOCK_ALLOW_EQUAL_PRI	0x8
+
+/*
+ * mraccessf/mrupdatef take flags to be passed in while sleeping;
+ * only PLTWAIT is currently supported.
+ */
+
+extern void	mraccessf(mrlock_t *, int);
+extern void	mrupdatef(mrlock_t *, int);
+extern void	mrlock(mrlock_t *, int, int);
+extern void	mrunlock(mrlock_t *);
+extern void	mraccunlock(mrlock_t *);
+extern int	mrtryupdate(mrlock_t *);
+extern int	mrtryaccess(mrlock_t *);
+extern int	mrtrypromote(mrlock_t *);
+extern void	mrdemote(mrlock_t *);
+
+extern int	ismrlocked(mrlock_t *, int);
+extern void	mrlock_init(mrlock_t *, int type, char *name, long sequence);
+extern void	mrfree(mrlock_t *);
+
+#define mrinit(mrp, name)	mrlock_init(mrp, MRLOCK_BARRIER, name, -1)
+#define mraccess(mrp)	mraccessf(mrp, 0)	/* grab for READ/ACCESS */
+#define mrupdate(mrp)	mrupdatef(mrp, 0)	/* grab for WRITE/UPDATE */
+
+#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/support/mutex.h b/fs/xfs/support/mutex.h
new file mode 100644
index 000000000000..346b2d663a08
--- /dev/null
+++ b/fs/xfs/support/mutex.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Portions Copyright (c) 2002 Christoph Hellwig.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_MUTEX_H__
+#define __XFS_SUPPORT_MUTEX_H__
+
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+
+/*
+ * Map the mutex'es from IRIX to Linux semaphores.
+ *
+ * Destroy just simply initializes to -99 which should block all other
+ * callers.
+ */
+#define MUTEX_DEFAULT		0x0
+typedef struct semaphore	mutex_t;
+
+#define mutex_init(lock, type, name)		sema_init(lock, 1)
+#define init_mutex(ptr, type, name, sequence)	sema_init(lock, 1)
+#define mutex_destroy(lock)			sema_init(lock, -99)
+#define mutex_lock(lock, num)			down(lock)
+#define mutex_trylock(lock)			(down_trylock(lock) ? 0 : 1)
+#define mutex_unlock(lock)			up(lock)
+
+#endif /* __XFS_SUPPORT_MUTEX_H__ */
diff --git a/fs/xfs/support/qsort.c b/fs/xfs/support/qsort.c
new file mode 100644
index 000000000000..5a91696ba75c
--- /dev/null
+++ b/fs/xfs/support/qsort.c
@@ -0,0 +1,243 @@
+/* Copyright (C) 1991, 1992, 1996, 1997, 1999 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Written by Douglas C. Schmidt (schmidt@ics.uci.edu).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/* If you consider tuning this algorithm, you should consult first:
+   Engineering a sort function; Jon Bentley and M. Douglas McIlroy;
+   Software - Practice and Experience; Vol. 23 (11), 1249-1265, 1993.  */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+/* Byte-wise swap two items of size SIZE. */
+#define SWAP(a, b, size)						      \
+  do									      \
+    {									      \
+      register size_t __size = (size);					      \
+      register char *__a = (a), *__b = (b);				      \
+      do								      \
+	{								      \
+	  char __tmp = *__a;						      \
+	  *__a++ = *__b;						      \
+	  *__b++ = __tmp;						      \
+	} while (--__size > 0);						      \
+    } while (0)
+
+/* Discontinue quicksort algorithm when partition gets below this size.
+   This particular magic number was chosen to work best on a Sun 4/260. */
+#define MAX_THRESH 4
+
+/* Stack node declarations used to store unfulfilled partition obligations. */
+typedef struct
+  {
+    char *lo;
+    char *hi;
+  } stack_node;
+
+/* The next 4 #defines implement a very fast in-line stack abstraction. */
+/* The stack needs log (total_elements) entries (we could even subtract
+   log(MAX_THRESH)).  Since total_elements has type size_t, we get as
+   upper bound for log (total_elements):
+   bits per byte (CHAR_BIT) * sizeof(size_t).  */
+#define STACK_SIZE	(8 * sizeof(unsigned long int))
+#define PUSH(low, high) ((void) ((top->lo = (low)), (top->hi = (high)), ++top))
+#define POP(low, high)	((void) (--top, (low = top->lo), (high = top->hi)))
+#define STACK_NOT_EMPTY (stack < top)
+
+
+/* Order size using quicksort.	This implementation incorporates
+   four optimizations discussed in Sedgewick:
+
+   1. Non-recursive, using an explicit stack of pointer that store the
+      next array partition to sort.  To save time, this maximum amount
+      of space required to store an array of SIZE_MAX is allocated on the
+      stack.  Assuming a 32-bit (64 bit) integer for size_t, this needs
+      only 32 * sizeof(stack_node) == 256 bytes (for 64 bit: 1024 bytes).
+      Pretty cheap, actually.
+
+   2. Chose the pivot element using a median-of-three decision tree.
+      This reduces the probability of selecting a bad pivot value and
+      eliminates certain extraneous comparisons.
+
+   3. Only quicksorts TOTAL_ELEMS / MAX_THRESH partitions, leaving
+      insertion sort to order the MAX_THRESH items within each partition.
+      This is a big win, since insertion sort is faster for small, mostly
+      sorted array segments.
+
+   4. The larger of the two sub-partitions is always pushed onto the
+      stack first, with the algorithm then concentrating on the
+      smaller partition.  This *guarantees* no more than log (total_elems)
+      stack size is needed (actually O(1) in this case)!  */
+
+void
+qsort (void *const pbase, size_t total_elems, size_t size,
+       int (*cmp)(const void *, const void *))
+{
+  register char *base_ptr = (char *) pbase;
+
+  const size_t max_thresh = MAX_THRESH * size;
+
+  if (total_elems == 0)
+    /* Avoid lossage with unsigned arithmetic below.  */
+    return;
+
+  if (total_elems > MAX_THRESH)
+    {
+      char *lo = base_ptr;
+      char *hi = &lo[size * (total_elems - 1)];
+      stack_node stack[STACK_SIZE];
+      stack_node *top = stack + 1;
+
+      while (STACK_NOT_EMPTY)
+	{
+	  char *left_ptr;
+	  char *right_ptr;
+
+	  /* Select median value from among LO, MID, and HI. Rearrange
+	     LO and HI so the three values are sorted. This lowers the
+	     probability of picking a pathological pivot value and
+	     skips a comparison for both the LEFT_PTR and RIGHT_PTR in
+	     the while loops. */
+
+	  char *mid = lo + size * ((hi - lo) / size >> 1);
+
+	  if ((*cmp) ((void *) mid, (void *) lo) < 0)
+	    SWAP (mid, lo, size);
+	  if ((*cmp) ((void *) hi, (void *) mid) < 0)
+	    SWAP (mid, hi, size);
+	  else
+	    goto jump_over;
+	  if ((*cmp) ((void *) mid, (void *) lo) < 0)
+	    SWAP (mid, lo, size);
+	jump_over:;
+
+	  left_ptr  = lo + size;
+	  right_ptr = hi - size;
+
+	  /* Here's the famous ``collapse the walls'' section of quicksort.
+	     Gotta like those tight inner loops!  They are the main reason
+	     that this algorithm runs much faster than others. */
+	  do
+	    {
+	      while ((*cmp) ((void *) left_ptr, (void *) mid) < 0)
+		left_ptr += size;
+
+	      while ((*cmp) ((void *) mid, (void *) right_ptr) < 0)
+		right_ptr -= size;
+
+	      if (left_ptr < right_ptr)
+		{
+		  SWAP (left_ptr, right_ptr, size);
+		  if (mid == left_ptr)
+		    mid = right_ptr;
+		  else if (mid == right_ptr)
+		    mid = left_ptr;
+		  left_ptr += size;
+		  right_ptr -= size;
+		}
+	      else if (left_ptr == right_ptr)
+		{
+		  left_ptr += size;
+		  right_ptr -= size;
+		  break;
+		}
+	    }
+	  while (left_ptr <= right_ptr);
+
+	  /* Set up pointers for next iteration.  First determine whether
+	     left and right partitions are below the threshold size.  If so,
+	     ignore one or both.  Otherwise, push the larger partition's
+	     bounds on the stack and continue sorting the smaller one. */
+
+	  if ((size_t) (right_ptr - lo) <= max_thresh)
+	    {
+	      if ((size_t) (hi - left_ptr) <= max_thresh)
+		/* Ignore both small partitions. */
+		POP (lo, hi);
+	      else
+		/* Ignore small left partition. */
+		lo = left_ptr;
+	    }
+	  else if ((size_t) (hi - left_ptr) <= max_thresh)
+	    /* Ignore small right partition. */
+	    hi = right_ptr;
+	  else if ((right_ptr - lo) > (hi - left_ptr))
+	    {
+	      /* Push larger left partition indices. */
+	      PUSH (lo, right_ptr);
+	      lo = left_ptr;
+	    }
+	  else
+	    {
+	      /* Push larger right partition indices. */
+	      PUSH (left_ptr, hi);
+	      hi = right_ptr;
+	    }
+	}
+    }
+
+  /* Once the BASE_PTR array is partially sorted by quicksort the rest
+     is completely sorted using insertion sort, since this is efficient
+     for partitions below MAX_THRESH size. BASE_PTR points to the beginning
+     of the array to sort, and END_PTR points at the very last element in
+     the array (*not* one beyond it!). */
+  {
+    char *const end_ptr = &base_ptr[size * (total_elems - 1)];
+    char *tmp_ptr = base_ptr;
+    char *thresh = min(end_ptr, base_ptr + max_thresh);
+    register char *run_ptr;
+
+    /* Find smallest element in first threshold and place it at the
+       array's beginning.  This is the smallest array element,
+       and the operation speeds up insertion sort's inner loop. */
+
+    for (run_ptr = tmp_ptr + size; run_ptr <= thresh; run_ptr += size)
+      if ((*cmp) ((void *) run_ptr, (void *) tmp_ptr) < 0)
+	tmp_ptr = run_ptr;
+
+    if (tmp_ptr != base_ptr)
+      SWAP (tmp_ptr, base_ptr, size);
+
+    /* Insertion sort, running from left-hand-side up to right-hand-side.  */
+
+    run_ptr = base_ptr + size;
+    while ((run_ptr += size) <= end_ptr)
+      {
+	tmp_ptr = run_ptr - size;
+	while ((*cmp) ((void *) run_ptr, (void *) tmp_ptr) < 0)
+	  tmp_ptr -= size;
+
+	tmp_ptr += size;
+	if (tmp_ptr != run_ptr)
+	  {
+	    char *trav;
+
+	    trav = run_ptr + size;
+	    while (--trav >= run_ptr)
+	      {
+		char c = *trav;
+		char *hi, *lo;
+
+		for (hi = lo = trav; (lo -= size) >= tmp_ptr; hi = lo)
+		  *hi = *lo;
+		*hi = c;
+	      }
+	  }
+      }
+  }
+}
diff --git a/fs/xfs/support/qsort.h b/fs/xfs/support/qsort.h
new file mode 100644
index 000000000000..cf2c5c233650
--- /dev/null
+++ b/fs/xfs/support/qsort.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#ifndef QSORT_H
+#define QSORT_H
+
+extern void qsort (void *const pbase,
+		    size_t total_elems,
+		    size_t size,
+		    int (*cmp)(const void *, const void *));
+
+#endif
diff --git a/fs/xfs/support/sema.h b/fs/xfs/support/sema.h
new file mode 100644
index 000000000000..8f053b307522
--- /dev/null
+++ b/fs/xfs/support/sema.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SEMA_H__
+#define __XFS_SUPPORT_SEMA_H__
+
+#include <linux/version.h>
+#include <linux/time.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+
+/*
+ * sema_t structure just maps to struct semaphore in Linux kernel.
+ */
+
+typedef struct semaphore sema_t;
+
+#define init_sema(sp, val, c, d)	sema_init(sp, val)
+#define initsema(sp, val)		sema_init(sp, val)
+#define initnsema(sp, val, name)	sema_init(sp, val)
+#define psema(sp, b)			down(sp)
+#define vsema(sp)			up(sp)
+#define valusema(sp)			(atomic_read(&(sp)->count))
+#define freesema(sema)
+
+/*
+ * Map cpsema (try to get the sema) to down_trylock. We need to switch
+ * the return values since cpsema returns 1 (acquired) 0 (failed) and
+ * down_trylock returns the reverse 0 (acquired) 1 (failed).
+ */
+
+#define cpsema(sp)			(down_trylock(sp) ? 0 : 1)
+
+/*
+ * Didn't do cvsema(sp). Not sure how to map this to up/down/...
+ * It does a vsema if the values is < 0 other wise nothing.
+ */
+
+#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/support/spin.h b/fs/xfs/support/spin.h
new file mode 100644
index 000000000000..f28f251ce8c3
--- /dev/null
+++ b/fs/xfs/support/spin.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Portions Copyright (c) 2002 Christoph Hellwig.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SPIN_H__
+#define __XFS_SUPPORT_SPIN_H__
+
+#include <linux/sched.h>	/* preempt needs this */
+#include <linux/spinlock.h>
+
+/*
+ * Map lock_t from IRIX to Linux spinlocks.
+ *
+ * Note that linux turns on/off spinlocks depending on CONFIG_SMP.
+ * We don't need to worry about SMP or not here.
+ */
+
+typedef spinlock_t lock_t;
+
+#define spinlock_init(lock, name)	spin_lock_init(lock)
+#define init_spinlock(lock, name, ll)	spin_lock_init(lock)
+#define spinlock_destroy(lock)
+
+static inline unsigned long mutex_spinlock(lock_t *lock)
+{
+	spin_lock(lock);
+	return 0;
+}
+
+/*ARGSUSED*/
+static inline void mutex_spinunlock(lock_t *lock, unsigned long s)
+{
+	spin_unlock(lock);
+}
+
+static inline void nested_spinlock(lock_t *lock)
+{
+	spin_lock(lock);
+}
+
+static inline void nested_spinunlock(lock_t *lock)
+{
+	spin_unlock(lock);
+}
+
+#endif /* __XFS_SUPPORT_SPIN_H__ */
diff --git a/fs/xfs/support/sv.h b/fs/xfs/support/sv.h
new file mode 100644
index 000000000000..7bd2986c80b8
--- /dev/null
+++ b/fs/xfs/support/sv.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Portions Copyright (c) 2002 Christoph Hellwig.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_SV_H__
+#define __XFS_SUPPORT_SV_H__
+
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+/*
+ * Synchronisation variables.
+ *
+ * (Parameters "pri", "svf" and "rts" are not implemented)
+ */
+
+typedef struct sv_s {
+	wait_queue_head_t waiters;
+} sv_t;
+
+#define SV_FIFO		0x0		/* sv_t is FIFO type */
+#define SV_LIFO		0x2		/* sv_t is LIFO type */
+#define SV_PRIO		0x4		/* sv_t is PRIO type */
+#define SV_KEYED	0x6		/* sv_t is KEYED type */
+#define SV_DEFAULT	SV_FIFO
+
+
+static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
+			     unsigned long timeout)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue_exclusive(&sv->waiters, &wait);
+	set_current_state(state);
+	spin_unlock(lock);
+
+	schedule_timeout(timeout);
+
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&sv->waiters, &wait);
+}
+
+#define init_sv(sv,type,name,flag) \
+	init_waitqueue_head(&(sv)->waiters)
+#define sv_init(sv,flag,name) \
+	init_waitqueue_head(&(sv)->waiters)
+#define sv_destroy(sv) \
+	/*NOTHING*/
+#define sv_wait(sv, pri, lock, s) \
+	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
+#define sv_wait_sig(sv, pri, lock, s)	\
+	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
+#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
+	_sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
+#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
+	_sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
+#define sv_signal(sv) \
+	wake_up(&(sv)->waiters)
+#define sv_broadcast(sv) \
+	wake_up_all(&(sv)->waiters)
+
+#endif /* __XFS_SUPPORT_SV_H__ */
diff --git a/fs/xfs/support/time.h b/fs/xfs/support/time.h
new file mode 100644
index 000000000000..f48b48e6fea4
--- /dev/null
+++ b/fs/xfs/support/time.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_TIME_H__
+#define __XFS_SUPPORT_TIME_H__
+
+#include <linux/sched.h>
+
+static inline void delay(long ticks)
+{
+	current->state = TASK_UNINTERRUPTIBLE;
+	schedule_timeout(ticks);
+}
+
+static inline void nanotime(struct timespec *tvp)
+{
+	tvp->tv_sec = xtime.tv_sec;
+	tvp->tv_nsec = xtime.tv_usec * 1000;
+}
+
+#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
new file mode 100644
index 000000000000..f30b857a3f9a
--- /dev/null
+++ b/fs/xfs/support/uuid.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/time.h>
+
+#ifdef __sparc__
+#include <asm/idprom.h>
+#else
+#include <linux/netdevice.h>
+#endif
+
+#include <xfs_types.h>
+#include <xfs_arch.h>
+#include "time.h"
+#include "move.h"
+#include "uuid.h"
+
+#ifndef CONFIG_NET
+#define dev_get_by_name(x)	(NULL)
+#define dev_put(x)		do { } while (0)
+#endif
+
+/* NODE_SIZE is the number of bytes used for the node identifier portion. */
+#define NODE_SIZE	6
+
+/*
+ * Total size must be 128 bits.	 N.B. definition of uuid_t in uuid.h!
+ */
+typedef struct {
+	u_int32_t	uu_timelow;	/* time "low" */
+	u_int16_t	uu_timemid;	/* time "mid" */
+	u_int16_t	uu_timehi;	/* time "hi" and version */
+	u_int16_t	uu_clockseq;	/* "reserved" and clock sequence */
+	u_int16_t	uu_node[NODE_SIZE / 2]; /* ethernet hardware address */
+} uu_t;
+
+/*
+ * The Time Base Correction is the amount to add on to a UNIX-based
+ * time value (i.e. seconds since 1 Jan. 1970) to convert it to the
+ * time base for UUIDs (15 Oct. 1582).
+ */
+#define UUID_TBC	0x01B21DD2138140LL
+
+static	short		uuid_eaddr[NODE_SIZE / 2];	/* ethernet address */
+static	__int64_t	uuid_time;	/* last time basis used */
+static	u_int16_t	uuid_clockseq;	/* boot-time randomizer */
+DECLARE_MUTEX(uuid_lock);
+
+/*
+ * uuid_init - called from out of init_tbl[]
+ */
+void
+uuid_init(void)
+{
+}
+
+/*
+ * uuid_getnodeuniq - obtain the node unique fields of a UUID.
+ *
+ * This is not in any way a standard or condoned UUID function;
+ * it just something that's needed for user-level file handles.
+ */
+void
+uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
+{
+	char	*uu=(char*)uuid;
+
+	/* on IRIX, this function assumes big-endian fields within
+	 * the uuid, so we use INT_GET to get the same result on
+	 * little-endian systems
+	 */
+
+	fsid[0] = (INT_GET(*(u_int16_t*)(uu+8), ARCH_CONVERT) << 16) +
+		   INT_GET(*(u_int16_t*)(uu+4), ARCH_CONVERT);
+	fsid[1] =  INT_GET(*(u_int32_t*)(uu  ), ARCH_CONVERT);
+}
+
+void
+uuid_create_nil(uuid_t *uuid)
+{
+	bzero(uuid, sizeof *uuid);
+}
+
+int
+uuid_is_nil(uuid_t *uuid)
+{
+	int	i;
+	char	*cp = (char *)uuid;
+
+	if (uuid == NULL)
+		return B_TRUE;
+	/* implied check of version number here... */
+	for (i = 0; i < sizeof *uuid; i++)
+		if (*cp++) return B_FALSE;	/* not nil */
+	return B_TRUE;	/* is nil */
+}
+
+int
+uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
+{
+	return bcmp(uuid1, uuid2, sizeof(uuid_t)) ? B_FALSE : B_TRUE;
+}
+
+/*
+ * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom
+ * 64-bit words.  NOTE: This function can not be changed EVER.	Although
+ * brain-dead, some applications depend on this 64-bit value remaining
+ * persistent.	Specifically, DMI vendors store the value as a persistent
+ * filehandle.
+ */
+__uint64_t
+uuid_hash64(uuid_t *uuid)
+{
+	__uint64_t	*sp = (__uint64_t *)uuid;
+
+	return sp[0] + sp[1];
+}	/* uuid_hash64 */
+
+static void
+get_eaddr(char *junk)
+{
+#ifdef __sparc__
+	memcpy(uuid_eaddr, idprom->id_ethaddr, 6);
+#else
+	struct net_device *dev;
+
+	dev = dev_get_by_name("eth0");
+	if (!dev || !dev->addr_len) {
+		get_random_bytes(uuid_eaddr, sizeof(uuid_eaddr));
+	} else {
+		memcpy(uuid_eaddr, dev->dev_addr,
+			dev->addr_len<sizeof(uuid_eaddr)?
+			dev->addr_len:sizeof(uuid_eaddr));
+		dev_put(dev);
+	}
+#endif
+}
+
+/*
+ * uuid_create - kernel version, does the actual work
+ */
+void
+uuid_create(uuid_t *uuid)
+{
+	int		i;
+	uu_t		*uu = (uu_t *)uuid;
+	static int	uuid_have_eaddr = 0;	/* ethernet addr inited? */
+	static int	uuid_is_init = 0;	/* time/clockseq inited? */
+
+	down(&uuid_lock);
+	if (!uuid_is_init) {
+		timespec_t	ts;
+
+		nanotime(&ts);
+		/*
+		 * The clock sequence must be initialized randomly.
+		 */
+		uuid_clockseq = ((unsigned long)jiffies & 0xfff) | 0x8000;
+		/*
+		 * Initialize the uuid time, it's in 100 nanosecond
+		 * units since a time base in 1582.
+		 */
+		uuid_time = ts.tv_sec * 10000000LL +
+			    ts.tv_nsec / 100LL +
+			    UUID_TBC;
+		uuid_is_init = 1;
+	}
+	if (!uuid_have_eaddr) {
+		uuid_have_eaddr = 1;
+		get_eaddr((char *)uuid_eaddr);
+	}
+	uuid_time++;
+	uu->uu_timelow = (u_int32_t)(uuid_time & 0x00000000ffffffffLL);
+	uu->uu_timemid = (u_int16_t)((uuid_time >> 32) & 0x0000ffff);
+	uu->uu_timehi = (u_int16_t)((uuid_time >> 48) & 0x00000fff) | 0x1000;
+	up(&uuid_lock);
+	uu->uu_clockseq = uuid_clockseq;
+	for (i = 0; i < (NODE_SIZE / 2); i++)
+		uu->uu_node [i] = uuid_eaddr [i];
+}
+
+int
+uuid_compare(uuid_t *uuid1, uuid_t *uuid2)
+{
+	int	i;
+	char	*cp1 = (char *) uuid1;
+	char	*cp2 = (char *) uuid2;
+
+	if (uuid1 == NULL) {
+		if (uuid2 == NULL) {
+			return 0;	/* equal because both are nil */
+		} else	{
+			return -1;	/* uuid1 nil, so precedes uuid2 */
+		}
+	} else if (uuid2 == NULL) {
+		return 1;
+	}
+
+	/* implied check of version number here... */
+	for (i = 0; i < sizeof(uuid_t); i++) {
+		if (*cp1 < *cp2)
+			return -1;
+		if (*cp1++ > *cp2++)
+			return 1;
+	}
+	return 0;	/* they're equal */
+}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
new file mode 100644
index 000000000000..e1fed2827475
--- /dev/null
+++ b/fs/xfs/support/uuid.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SUPPORT_UUID_H__
+#define __XFS_SUPPORT_UUID_H__
+
+void uuid_create_nil(uuid_t *uuid);
+int uuid_is_nil(uuid_t *uuid);
+int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
+void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
+__uint64_t uuid_hash64(uuid_t *uuid);
+
+#endif	/* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
new file mode 100644
index 000000000000..b30e56c953fe
--- /dev/null
+++ b/fs/xfs/xfs.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_H__
+#define __XFS_H__
+
+#include <linux/config.h>
+
+#include <xfs_types.h>
+#include <xfs_arch.h>
+
+#include <support/kmem.h>
+#include <support/mrlock.h>
+#include <support/qsort.h>
+#include <support/spin.h>
+#include <support/sv.h>
+#include <support/ktrace.h>
+#include <support/mutex.h>
+#include <support/sema.h>
+#include <support/atomic.h>
+#include <support/debug.h>
+#include <support/move.h>
+#include <support/uuid.h>
+#include <support/time.h>
+
+#include <linux/xfs_linux.h>
+
+#include <xfs_fs.h>
+#include <xfs_buf.h>
+#include <xfs_macros.h>
+#include <xfs_inum.h>
+#include <xfs_log.h>
+#include <xfs_clnt.h>
+#include <xfs_trans.h>
+#include <xfs_sb.h>
+#include <xfs_ag.h>
+#include <xfs_dir.h>
+#include <xfs_dir2.h>
+#include <xfs_imap.h>
+#include <xfs_mount.h>
+#include <xfs_alloc_btree.h>
+#include <xfs_bmap_btree.h>
+#include <xfs_ialloc_btree.h>
+#include <xfs_btree.h>
+#include <xfs_ialloc.h>
+#include <xfs_attr_sf.h>
+#include <xfs_dir_sf.h>
+#include <xfs_dir2_sf.h>
+#include <xfs_dinode.h>
+#include <xfs_inode.h>
+#include <xfs_alloc.h>
+#include <xfs_bmap.h>
+#include <xfs_bit.h>
+#include <xfs_rtalloc.h>
+#include <xfs_error.h>
+#include <xfs_quota.h>
+#include <xfs_itable.h>
+#include <xfs_dqblk.h>
+#include <xfs_dquot_item.h>
+#include <xfs_dquot.h>
+#include <xfs_qm.h>
+#include <xfs_rw.h>
+#include <xfs_da_btree.h>
+#include <xfs_dir_leaf.h>
+#include <xfs_dir2_data.h>
+#include <xfs_dir2_leaf.h>
+#include <xfs_dir2_block.h>
+#include <xfs_dir2_node.h>
+#include <xfs_dir2_trace.h>
+#include <xfs_acl.h>
+#include <xfs_cap.h>
+#include <xfs_mac.h>
+#include <xfs_attr.h>
+#include <xfs_attr_leaf.h>
+#include <xfs_inode_item.h>
+#include <xfs_buf_item.h>
+#include <xfs_extfree_item.h>
+#include <xfs_log_priv.h>
+#include <xfs_trans_priv.h>
+#include <xfs_trans_space.h>
+#include <xfs_utils.h>
+#include <xfs_dmapi.h>
+
+#include <linux/xfs_lrw.h>
+
+#endif	/* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
new file mode 100644
index 000000000000..b8c496b1fdea
--- /dev/null
+++ b/fs/xfs/xfs_acl.c
@@ -0,0 +1,893 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <linux/posix_acl_xattr.h>
+
+STATIC int	xfs_acl_setmode(vnode_t *, xfs_acl_t *);
+STATIC void	xfs_acl_filter_mode(mode_t, xfs_acl_t *);
+STATIC void	xfs_acl_get_endian(xfs_acl_t *);
+STATIC int	xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
+STATIC int	xfs_acl_invalid(xfs_acl_t *);
+STATIC void	xfs_acl_sync_mode(mode_t, xfs_acl_t *);
+STATIC void	xfs_acl_get_attr(vnode_t *, xfs_acl_t *, int, int, int *);
+STATIC void	xfs_acl_set_attr(vnode_t *, xfs_acl_t *, int, int *);
+STATIC int	xfs_acl_allow_set(vnode_t *, int);
+
+kmem_zone_t *xfs_acl_zone;
+
+
+/*
+ * Test for existence of access ACL attribute as efficiently as possible.
+ */
+int
+xfs_acl_vhasacl_access(
+	vnode_t		*vp)
+{
+	int		error;
+
+	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
+	return (error == 0);
+}
+
+/*
+ * Test for existence of default ACL attribute as efficiently as possible.
+ */
+int
+xfs_acl_vhasacl_default(
+	vnode_t		*vp)
+{
+	int		error;
+
+	if (vp->v_type != VDIR)
+		return 0;
+	xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
+	return (error == 0);
+}
+
+/*
+ * Convert from extended attribute representation to in-memory for XFS.
+ */
+STATIC int
+posix_acl_xattr_to_xfs(
+	posix_acl_xattr_header	*src,
+	size_t			size,
+	xfs_acl_t		*dest)
+{
+	posix_acl_xattr_entry	*src_entry;
+	xfs_acl_entry_t		*dest_entry;
+	int			n;
+
+	if (!src || !dest)
+		return EINVAL;
+
+	if (size < sizeof(posix_acl_xattr_header))
+		return EINVAL;
+
+	if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+		return EINVAL;
+
+	memset(dest, 0, sizeof(xfs_acl_t));
+	dest->acl_cnt = posix_acl_xattr_count(size);
+	if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
+		return EINVAL;
+
+	/*
+	 * acl_set_file(3) may request that we set default ACLs with
+	 * zero length -- defend (gracefully) against that here.
+	 */
+	if (!dest->acl_cnt)
+		return 0;
+
+	src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
+	dest_entry = &dest->acl_entry[0];
+
+	for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
+		dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
+		if (_ACL_PERM_INVALID(dest_entry->ae_perm))
+			return EINVAL;
+		dest_entry->ae_tag  = le16_to_cpu(src_entry->e_tag);
+		switch(dest_entry->ae_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			dest_entry->ae_id = ACL_UNDEFINED_ID;
+			break;
+		default:
+			return EINVAL;
+		}
+	}
+	if (xfs_acl_invalid(dest))
+		return EINVAL;
+
+	return 0;
+}
+
+/*
+ * Comparison function called from qsort().
+ * Primary key is ae_tag, secondary key is ae_id.
+ */
+STATIC int
+xfs_acl_entry_compare(
+	const void	*va,
+	const void	*vb)
+{
+	xfs_acl_entry_t *a = (xfs_acl_entry_t *)va,
+			*b = (xfs_acl_entry_t *)vb;
+
+	if (a->ae_tag == b->ae_tag)
+		return (a->ae_id - b->ae_id);
+	return (a->ae_tag - b->ae_tag);
+}
+
+/*
+ * Convert from in-memory XFS to extended attribute representation.
+ */
+STATIC int
+posix_acl_xfs_to_xattr(
+	xfs_acl_t		*src,
+	posix_acl_xattr_header	*dest,
+	size_t			size)
+{
+	int			n;
+	size_t			new_size = posix_acl_xattr_size(src->acl_cnt);
+	posix_acl_xattr_entry	*dest_entry;
+	xfs_acl_entry_t		*src_entry;
+
+	if (size < new_size)
+		return -ERANGE;
+
+	/* Need to sort src XFS ACL by <ae_tag,ae_id> */
+	qsort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
+		xfs_acl_entry_compare);
+
+	dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+	dest_entry = &dest->a_entries[0];
+	src_entry = &src->acl_entry[0];
+	for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
+		dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
+		if (_ACL_PERM_INVALID(src_entry->ae_perm))
+			return -EINVAL;
+		dest_entry->e_tag  = cpu_to_le16(src_entry->ae_tag);
+		switch (src_entry->ae_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
+				break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	return new_size;
+}
+
+int
+xfs_acl_vget(
+	vnode_t		*vp,
+	void		*acl,
+	size_t		size,
+	int		kind)
+{
+	int			error;
+	xfs_acl_t		*xfs_acl;
+	posix_acl_xattr_header	*ext_acl = acl;
+
+	VN_HOLD(vp);
+	if ((error = _MAC_VACCESS(vp, NULL, VREAD)))
+		goto out;
+	if (!(_ACL_ALLOC(xfs_acl))) {
+		error = ENOMEM;
+		goto out;
+	}
+
+	memset(xfs_acl, 0, sizeof(xfs_acl_t));
+	xfs_acl_get_attr(vp, xfs_acl, kind, size? 0 : ATTR_KERNOVAL, &error);
+	if (error)
+		goto out;
+
+	if (!size) {
+		error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
+	} else {
+		if (xfs_acl_invalid(xfs_acl)) {
+			error = EINVAL;
+			goto out;
+		}
+		if (kind == _ACL_TYPE_ACCESS) {
+			vattr_t va;
+
+			va.va_mask = AT_MODE;
+			VOP_GETATTR(vp, &va, 0, sys_cred, error);
+			if (error)
+				goto out;
+			xfs_acl_sync_mode(va.va_mode, xfs_acl);
+		}
+		error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
+	}
+out:
+	VN_RELE(vp);
+	_ACL_FREE(xfs_acl);
+	return -error;
+}
+
+int
+xfs_acl_vremove(
+	vnode_t		*vp,
+	int		kind)
+{
+	int		error;
+
+	VN_HOLD(vp);
+	error = xfs_acl_allow_set(vp, kind);
+	if (!error) {
+		VOP_ATTR_REMOVE(vp, kind == _ACL_TYPE_DEFAULT?
+				SGI_ACL_DEFAULT: SGI_ACL_FILE,
+				ATTR_ROOT, sys_cred, error);
+		if (error == ENOATTR)
+			error = 0;	/* 'scool */
+	}
+	VN_RELE(vp);
+	return -error;
+}
+
+int
+xfs_acl_vset(
+	vnode_t			*vp,
+	void			*acl,
+	size_t			size,
+	int			kind)
+{
+	posix_acl_xattr_header	*ext_acl = acl;
+	xfs_acl_t		*xfs_acl;
+	int			error;
+
+	if (!acl)
+		return -EINVAL;
+
+	if (!(_ACL_ALLOC(xfs_acl)))
+		return -ENOMEM;
+
+	error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
+	if (error) {
+		_ACL_FREE(xfs_acl);
+		return -error;
+	}
+	if (!xfs_acl->acl_cnt) {
+		_ACL_FREE(xfs_acl);
+		return 0;
+	}
+
+	VN_HOLD(vp);
+	error = xfs_acl_allow_set(vp, kind);
+	if (error)
+		goto out;
+
+	/* Incoming ACL exists, set file mode based on its value */
+	if (kind == _ACL_TYPE_ACCESS)
+		xfs_acl_setmode(vp, xfs_acl);
+	xfs_acl_set_attr(vp, xfs_acl, kind, &error);
+out:
+	VN_RELE(vp);
+	_ACL_FREE(xfs_acl);
+	return -error;
+}
+
+int
+xfs_acl_iaccess(
+	xfs_inode_t	*ip,
+	mode_t		mode,
+	cred_t		*cr)
+{
+	xfs_acl_t	*acl;
+	int		error;
+
+	if (!(_ACL_ALLOC(acl)))
+		return -1;
+
+	/* If the file has no ACL return -1. */
+	if (xfs_attr_fetch(ip, SGI_ACL_FILE, (char *)acl, sizeof(xfs_acl_t))) {
+		_ACL_FREE(acl);
+		return -1;
+	}
+	xfs_acl_get_endian(acl);
+
+	/* If the file has an empty ACL return -1. */
+	if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
+		_ACL_FREE(acl);
+		return -1;
+	}
+
+	/* Synchronize ACL with mode bits */
+	xfs_acl_sync_mode(ip->i_d.di_mode, acl);
+
+	error = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
+	_ACL_FREE(acl);
+	return error;
+}
+
+STATIC int
+xfs_acl_allow_set(
+	vnode_t		*vp,
+	int		kind)
+{
+	vattr_t		va;
+	int		error;
+
+	if (kind == _ACL_TYPE_DEFAULT && vp->v_type != VDIR)
+		return ENOTDIR;
+	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
+		return EROFS;
+	if ((error = _MAC_VACCESS(vp, NULL, VWRITE)))
+		return error;
+	va.va_mask = AT_UID;
+	VOP_GETATTR(vp, &va, 0, NULL, error);
+	if (error)
+		return error;
+	if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
+		return EPERM;
+	return error;
+}
+
+/*
+ * The access control process to determine the access permission:
+ *	if uid == file owner id, use the file owner bits.
+ *	if gid == file owner group id, use the file group bits.
+ *	scan ACL for a maching user or group, and use matched entry
+ *	permission. Use total permissions of all matching group entries,
+ *	until all acl entries are exhausted. The final permission produced
+ *	by matching acl entry or entries needs to be & with group permission.
+ *	if not owner, owning group, or matching entry in ACL, use file
+ *	other bits.
+ */
+STATIC int
+xfs_acl_capability_check(
+	mode_t		mode,
+	cred_t		*cr)
+{
+	if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH))
+		return EACCES;
+	if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
+		return EACCES;
+	if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE))
+		return EACCES;
+	return 0;
+}
+
+/*
+ * Note: cr is only used here for the capability check if the ACL test fails.
+ *	 It is not used to find out the credentials uid or groups etc, as was
+ *	 done in IRIX. It is assumed that the uid and groups for the current
+ *	 thread are taken from "current" instead of the cr parameter.
+ */
+STATIC int
+xfs_acl_access(
+	uid_t		fuid,
+	gid_t		fgid,
+	xfs_acl_t	*fap,
+	mode_t		md,
+	cred_t		*cr)
+{
+	xfs_acl_entry_t matched;
+	int		i, allows;
+	int		maskallows = -1;	/* true, but not 1, either */
+	int		seen_userobj = 0;
+
+	matched.ae_tag = 0;	/* Invalid type */
+	md >>= 6;	/* Normalize the bits for comparison */
+
+	for (i = 0; i < fap->acl_cnt; i++) {
+		/*
+		 * Break out if we've got a user_obj entry or
+		 * a user entry and the mask (and have processed USER_OBJ)
+		 */
+		if (matched.ae_tag == ACL_USER_OBJ)
+			break;
+		if (matched.ae_tag == ACL_USER) {
+			if (maskallows != -1 && seen_userobj)
+				break;
+			if (fap->acl_entry[i].ae_tag != ACL_MASK &&
+			    fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
+				continue;
+		}
+		/* True if this entry allows the requested access */
+		allows = ((fap->acl_entry[i].ae_perm & md) == md);
+
+		switch (fap->acl_entry[i].ae_tag) {
+		case ACL_USER_OBJ:
+			seen_userobj = 1;
+			if (fuid != current->fsuid)
+				continue;
+			matched.ae_tag = ACL_USER_OBJ;
+			matched.ae_perm = allows;
+			break;
+		case ACL_USER:
+			if (fap->acl_entry[i].ae_id != current->fsuid)
+				continue;
+			matched.ae_tag = ACL_USER;
+			matched.ae_perm = allows;
+			break;
+		case ACL_GROUP_OBJ:
+			if ((matched.ae_tag == ACL_GROUP_OBJ ||
+			    matched.ae_tag == ACL_GROUP) && !allows)
+				continue;
+			if (!in_group_p(fgid))
+				continue;
+			matched.ae_tag = ACL_GROUP_OBJ;
+			matched.ae_perm = allows;
+			break;
+		case ACL_GROUP:
+			if ((matched.ae_tag == ACL_GROUP_OBJ ||
+			    matched.ae_tag == ACL_GROUP) && !allows)
+				continue;
+			if (!in_group_p(fap->acl_entry[i].ae_id))
+				continue;
+			matched.ae_tag = ACL_GROUP;
+			matched.ae_perm = allows;
+			break;
+		case ACL_MASK:
+			maskallows = allows;
+			break;
+		case ACL_OTHER:
+			if (matched.ae_tag != 0)
+				continue;
+			matched.ae_tag = ACL_OTHER;
+			matched.ae_perm = allows;
+			break;
+		}
+	}
+	/*
+	 * First possibility is that no matched entry allows access.
+	 * The capability to override DAC may exist, so check for it.
+	 */
+	switch (matched.ae_tag) {
+	case ACL_OTHER:
+	case ACL_USER_OBJ:
+		if (matched.ae_perm)
+			return 0;
+		break;
+	case ACL_USER:
+	case ACL_GROUP_OBJ:
+	case ACL_GROUP:
+		if (maskallows && matched.ae_perm)
+			return 0;
+		break;
+	case 0:
+		break;
+	}
+	return xfs_acl_capability_check(md, cr);
+}
+
+/*
+ * ACL validity checker.
+ *   This acl validation routine checks each ACL entry read in makes sense.
+ */
+STATIC int
+xfs_acl_invalid(
+	xfs_acl_t	*aclp)
+{
+	xfs_acl_entry_t *entry, *e;
+	int		user = 0, group = 0, other = 0, mask = 0;
+	int		mask_required = 0;
+	int		i, j;
+
+	if (!aclp)
+		goto acl_invalid;
+
+	if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
+		goto acl_invalid;
+
+	for (i = 0; i < aclp->acl_cnt; i++) {
+		entry = &aclp->acl_entry[i];
+		switch (entry->ae_tag) {
+		case ACL_USER_OBJ:
+			if (user++)
+				goto acl_invalid;
+			break;
+		case ACL_GROUP_OBJ:
+			if (group++)
+				goto acl_invalid;
+			break;
+		case ACL_OTHER:
+			if (other++)
+				goto acl_invalid;
+			break;
+		case ACL_USER:
+		case ACL_GROUP:
+			for (j = i + 1; j < aclp->acl_cnt; j++) {
+				e = &aclp->acl_entry[j];
+				if (e->ae_id == entry->ae_id &&
+				    e->ae_tag == entry->ae_tag)
+					goto acl_invalid;
+			}
+			mask_required++;
+			break;
+		case ACL_MASK:
+			if (mask++)
+				goto acl_invalid;
+			break;
+		default:
+			goto acl_invalid;
+		}
+	}
+	if (!user || !group || !other || (mask_required && !mask))
+		goto acl_invalid;
+	else
+		return 0;
+acl_invalid:
+	return EINVAL;
+}
+
+/*
+ * Do ACL endian conversion.
+ */
+STATIC void
+xfs_acl_get_endian(
+	xfs_acl_t	*aclp)
+{
+	xfs_acl_entry_t *ace, *end;
+
+	INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
+	end = &aclp->acl_entry[0]+aclp->acl_cnt;
+	for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
+		INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
+		INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
+		INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
+	}
+}
+
+/*
+ * Get the ACL from the EA and do endian conversion.
+ */
+STATIC void
+xfs_acl_get_attr(
+	vnode_t		*vp,
+	xfs_acl_t	*aclp,
+	int		kind,
+	int		flags,
+	int		*error)
+{
+	int		len = sizeof(xfs_acl_t);
+
+	flags |= ATTR_ROOT;
+	VOP_ATTR_GET(vp,
+		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT,
+		(char *)aclp, &len, flags, sys_cred, *error);
+	if (*error || (flags & ATTR_KERNOVAL))
+		return;
+	xfs_acl_get_endian(aclp);
+}
+
+/*
+ * Set the EA with the ACL and do endian conversion.
+ */
+STATIC void
+xfs_acl_set_attr(
+	vnode_t		*vp,
+	xfs_acl_t	*aclp,
+	int		kind,
+	int		*error)
+{
+	xfs_acl_entry_t *ace, *newace, *end;
+	xfs_acl_t	*newacl;
+	int		len;
+
+	if (!(_ACL_ALLOC(newacl))) {
+		*error = ENOMEM;
+		return;
+	}
+
+	len = sizeof(xfs_acl_t) -
+	      (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
+	end = &aclp->acl_entry[0]+aclp->acl_cnt;
+	for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
+	     ace < end;
+	     ace++, newace++) {
+		INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
+		INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
+		INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
+	}
+	INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
+	VOP_ATTR_SET(vp,
+		kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE: SGI_ACL_DEFAULT,
+		(char *)newacl, len, ATTR_ROOT, sys_cred, *error);
+	_ACL_FREE(newacl);
+}
+
+int
+xfs_acl_vtoacl(
+	vnode_t		*vp,
+	xfs_acl_t	*access_acl,
+	xfs_acl_t	*default_acl)
+{
+	vattr_t		va;
+	int		error = 0;
+
+	if (access_acl) {
+		/*
+		 * Get the Access ACL and the mode.  If either cannot
+		 * be obtained for some reason, invalidate the access ACL.
+		 */
+		xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
+		if (!error) {
+			/* Got the ACL, need the mode... */
+			va.va_mask = AT_MODE;
+			VOP_GETATTR(vp, &va, 0, sys_cred, error);
+		}
+
+		if (error)
+			access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
+		else /* We have a good ACL and the file mode, synchronize. */
+			xfs_acl_sync_mode(va.va_mode, access_acl);
+	}
+
+	if (default_acl) {
+		xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
+		if (error)
+			default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
+	}
+	return error;
+}
+
+/*
+ * This function retrieves the parent directory's acl, processes it
+ * and lets the child inherit the acl(s) that it should.
+ */
+int
+xfs_acl_inherit(
+	vnode_t		*vp,
+	vattr_t		*vap,
+	xfs_acl_t	*pdaclp)
+{
+	xfs_acl_t	*cacl;
+	int		error = 0;
+
+	/*
+	 * If the parent does not have a default ACL, or it's an
+	 * invalid ACL, we're done.
+	 */
+	if (!vp)
+		return 0;
+	if (!pdaclp || xfs_acl_invalid(pdaclp))
+		return 0;
+
+	/*
+	 * Copy the default ACL of the containing directory to
+	 * the access ACL of the new file and use the mode that
+	 * was passed in to set up the correct initial values for
+	 * the u::,g::[m::], and o:: entries.  This is what makes
+	 * umask() "work" with ACL's.
+	 */
+
+	if (!(_ACL_ALLOC(cacl)))
+		return ENOMEM;
+
+	memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
+	xfs_acl_filter_mode(vap->va_mode, cacl);
+	xfs_acl_setmode(vp, cacl);
+
+	/*
+	 * Set the Default and Access ACL on the file.	The mode is already
+	 * set on the file, so we don't need to worry about that.
+	 *
+	 * If the new file is a directory, its default ACL is a copy of
+	 * the containing directory's default ACL.
+	 */
+	if (vp->v_type == VDIR)
+		xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
+	if (!error)
+		xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
+	_ACL_FREE(cacl);
+	return error;
+}
+
+/*
+ * Set up the correct mode on the file based on the supplied ACL.  This
+ * makes sure that the mode on the file reflects the state of the
+ * u::,g::[m::], and o:: entries in the ACL.  Since the mode is where
+ * the ACL is going to get the permissions for these entries, we must
+ * synchronize the mode whenever we set the ACL on a file.
+ */
+STATIC int
+xfs_acl_setmode(
+	vnode_t		*vp,
+	xfs_acl_t	*acl)
+{
+	vattr_t		va;
+	xfs_acl_entry_t *ap;
+	xfs_acl_entry_t *gap = NULL;
+	int		i, error, nomask = 1;
+
+	if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
+		return 0;
+
+	/*
+	 * Copy the u::, g::, o::, and m:: bits from the ACL into the
+	 * mode.  The m:: bits take precedence over the g:: bits.
+	 */
+	va.va_mask = AT_MODE;
+	VOP_GETATTR(vp, &va, 0, sys_cred, error);
+	if (error)
+		return error;
+
+	va.va_mask = AT_MODE;
+	va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
+	ap = acl->acl_entry;
+	for (i = 0; i < acl->acl_cnt; ++i) {
+		switch (ap->ae_tag) {
+		case ACL_USER_OBJ:
+			va.va_mode |= ap->ae_perm << 6;
+			break;
+		case ACL_GROUP_OBJ:
+			gap = ap;
+			break;
+		case ACL_MASK:
+			nomask = 0;
+			va.va_mode |= ap->ae_perm << 3;
+			break;
+		case ACL_OTHER:
+			va.va_mode |= ap->ae_perm;
+			break;
+		default:
+			break;
+		}
+		ap++;
+	}
+
+	/* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
+	if (gap && nomask)
+		va.va_mode |= gap->ae_perm << 3;
+
+	VOP_SETATTR(vp, &va, 0, sys_cred, error);
+	return error;
+}
+
+/*
+ * The permissions for the special ACL entries (u::, g::[m::], o::) are
+ * actually stored in the file mode (if there is both a group and a mask,
+ * the group is stored in the ACL entry and the mask is stored on the file).
+ * This allows the mode to remain automatically in sync with the ACL without
+ * the need for a call-back to the ACL system at every point where the mode
+ * could change.  This function takes the permissions from the specified mode
+ * and places it in the supplied ACL.
+ *
+ * This implementation draws its validity from the fact that, when the ACL
+ * was assigned, the mode was copied from the ACL.
+ * If the mode did not change, therefore, the mode remains exactly what was
+ * taken from the special ACL entries at assignment.
+ * If a subsequent chmod() was done, the POSIX spec says that the change in
+ * mode must cause an update to the ACL seen at user level and used for
+ * access checks.  Before and after a mode change, therefore, the file mode
+ * most accurately reflects what the special ACL entries should permit/deny.
+ *
+ * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
+ *	   the existing mode bits will override whatever is in the
+ *	   ACL. Similarly, if there is a pre-existing ACL that was
+ *	   never in sync with its mode (owing to a bug in 6.5 and
+ *	   before), it will now magically (or mystically) be
+ *	   synchronized.  This could cause slight astonishment, but
+ *	   it is better than inconsistent permissions.
+ *
+ * The supplied ACL is a template that may contain any combination
+ * of special entries.	These are treated as place holders when we fill
+ * out the ACL.	 This routine does not add or remove special entries, it
+ * simply unites each special entry with its associated set of permissions.
+ */
+STATIC void
+xfs_acl_sync_mode(
+	mode_t		mode,
+	xfs_acl_t	*acl)
+{
+	int		i, nomask = 1;
+	xfs_acl_entry_t *ap;
+	xfs_acl_entry_t *gap = NULL;
+
+	/*
+	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
+	 * be set instead of the GROUP entry, if there is a MASK.
+	 */
+	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
+		switch (ap->ae_tag) {
+		case ACL_USER_OBJ:
+			ap->ae_perm = (mode >> 6) & 0x7;
+			break;
+		case ACL_GROUP_OBJ:
+			gap = ap;
+			break;
+		case ACL_MASK:
+			nomask = 0;
+			ap->ae_perm = (mode >> 3) & 0x7;
+			break;
+		case ACL_OTHER:
+			ap->ae_perm = mode & 0x7;
+			break;
+		default:
+			break;
+		}
+	}
+	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
+	if (gap && nomask)
+		gap->ae_perm = (mode >> 3) & 0x7;
+}
+
+/*
+ * When inheriting an Access ACL from a directory Default ACL,
+ * the ACL bits are set to the intersection of the ACL default
+ * permission bits and the file permission bits in mode. If there
+ * are no permission bits on the file then we must not give them
+ * the ACL. This is what what makes umask() work with ACLs.
+ */
+STATIC void
+xfs_acl_filter_mode(
+	mode_t		mode,
+	xfs_acl_t	*acl)
+{
+	int		i, nomask = 1;
+	xfs_acl_entry_t *ap;
+	xfs_acl_entry_t *gap = NULL;
+
+	/*
+	 * Set ACL entries. POSIX1003.1eD16 requires that the MASK
+	 * be merged with GROUP entry, if there is a MASK.
+	 */
+	for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
+		switch (ap->ae_tag) {
+		case ACL_USER_OBJ:
+			ap->ae_perm &= (mode >> 6) & 0x7;
+			break;
+		case ACL_GROUP_OBJ:
+			gap = ap;
+			break;
+		case ACL_MASK:
+			nomask = 0;
+			ap->ae_perm &= (mode >> 3) & 0x7;
+			break;
+		case ACL_OTHER:
+			ap->ae_perm &= mode & 0x7;
+			break;
+		default:
+			break;
+		}
+	}
+	/* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
+	if (gap && nomask)
+		gap->ae_perm &= (mode >> 3) & 0x7;
+}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
new file mode 100644
index 000000000000..c6fd578694a5
--- /dev/null
+++ b/fs/xfs/xfs_acl.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ACL_H__
+#define __XFS_ACL_H__
+
+/*
+ * Access Control Lists
+ */
+typedef __uint16_t	xfs_acl_perm_t;
+typedef __int32_t	xfs_acl_type_t;
+typedef __int32_t	xfs_acl_tag_t;
+typedef __int32_t	xfs_acl_id_t;
+
+#define XFS_ACL_MAX_ENTRIES 25
+#define XFS_ACL_NOT_PRESENT (-1)
+
+typedef struct xfs_acl_entry {
+	xfs_acl_tag_t	ae_tag;
+	xfs_acl_id_t	ae_id;
+	xfs_acl_perm_t	ae_perm;
+} xfs_acl_entry_t;
+
+typedef struct xfs_acl {
+	__int32_t	acl_cnt;
+	xfs_acl_entry_t acl_entry[XFS_ACL_MAX_ENTRIES];
+} xfs_acl_t;
+
+/* On-disk XFS extended attribute names */
+#define SGI_ACL_FILE	"SGI_ACL_FILE"
+#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
+#define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
+#define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+struct vattr;
+struct vnode;
+struct xfs_inode;
+
+extern int xfs_acl_inherit(struct vnode *, struct vattr *, xfs_acl_t *);
+extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
+extern int xfs_acl_get(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_set(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vtoacl(struct vnode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vhasacl_access(struct vnode *);
+extern int xfs_acl_vhasacl_default(struct vnode *);
+extern int xfs_acl_vset(struct vnode *, void *, size_t, int);
+extern int xfs_acl_vget(struct vnode *, void *, size_t, int);
+extern int xfs_acl_vremove(struct vnode *vp, int);
+
+extern struct kmem_zone *xfs_acl_zone;
+
+#define _ACL_TYPE_ACCESS	1
+#define _ACL_TYPE_DEFAULT	2
+#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
+
+#define _ACL_DECL(a)		xfs_acl_t *(a) = NULL
+#define _ACL_ALLOC(a)		((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
+#define _ACL_FREE(a)		((a)? kmem_zone_free(xfs_acl_zone, (a)) : 0)
+#define _ACL_ZONE_INIT(z,name)	((z) = kmem_zone_init(sizeof(xfs_acl_t), name))
+#define _ACL_ZONE_DESTROY(z)	(kmem_cache_destroy(z))
+#define _ACL_INHERIT(c,v,d)	(xfs_acl_inherit(c,v,d))
+#define _ACL_GET_ACCESS(pv,pa)	(xfs_acl_vtoacl(pv,pa,NULL) == 0)
+#define _ACL_GET_DEFAULT(pv,pd) (xfs_acl_vtoacl(pv,NULL,pd) == 0)
+#define _ACL_ACCESS_EXISTS	xfs_acl_vhasacl_access
+#define _ACL_DEFAULT_EXISTS	xfs_acl_vhasacl_default
+#define _ACL_XFS_IACCESS(i,m,c) (XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1)
+
+#else
+#define xfs_acl_vset(v,p,sz,t)	(-EOPNOTSUPP)
+#define xfs_acl_vget(v,p,sz,t)	(-EOPNOTSUPP)
+#define xfs_acl_vremove(v,t)	(-EOPNOTSUPP)
+#define _ACL_DECL(a)		((void)0)
+#define _ACL_ALLOC(a)		(1)	/* successfully allocate nothing */
+#define _ACL_FREE(a)		((void)0)
+#define _ACL_ZONE_INIT(z,name)	((void)0)
+#define _ACL_ZONE_DESTROY(z)	((void)0)
+#define _ACL_INHERIT(c,v,d)	(0)
+#define _ACL_GET_ACCESS(pv,pa)	(0)
+#define _ACL_GET_DEFAULT(pv,pd) (0)
+#define _ACL_ACCESS_EXISTS	(NULL)
+#define _ACL_DEFAULT_EXISTS	(NULL)
+#define _ACL_XFS_IACCESS(i,m,c) (-1)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
new file mode 100644
index 000000000000..2bec8dd18c52
--- /dev/null
+++ b/fs/xfs/xfs_ag.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_AG_H__
+#define __XFS_AG_H__
+
+/*
+ * Allocation group header
+ * This is divided into three structures, placed in sequential 512-byte
+ * buffers after a copy of the superblock (also in a 512-byte buffer).
+ */
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_trans;
+
+#define XFS_AGF_MAGIC	0x58414746	/* 'XAGF' */
+#define XFS_AGI_MAGIC	0x58414749	/* 'XAGI' */
+#define XFS_AGF_VERSION 1
+#define XFS_AGI_VERSION 1
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_GOOD_VERSION)
+int xfs_agf_good_version(unsigned v);
+#define XFS_AGF_GOOD_VERSION(v) xfs_agf_good_version(v)
+#else
+#define XFS_AGF_GOOD_VERSION(v)		((v) == XFS_AGF_VERSION)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_GOOD_VERSION)
+int xfs_agi_good_version(unsigned v);
+#define XFS_AGI_GOOD_VERSION(v) xfs_agi_good_version(v)
+#else
+#define XFS_AGI_GOOD_VERSION(v)		((v) == XFS_AGI_VERSION)
+#endif
+
+/*
+ * Btree number 0 is bno, 1 is cnt.  This value gives the size of the
+ * arrays below.
+ */
+#define XFS_BTNUM_AGF	((int)XFS_BTNUM_CNTi + 1)
+
+/*
+ * The second word of agf_levels in the first a.g. overlaps the EFS
+ * superblock's magic number.  Since the magic numbers valid for EFS
+ * are > 64k, our value cannot be confused for an EFS superblock's.
+ */
+
+typedef struct xfs_agf
+{
+	/*
+	 * Common allocation group header information
+	 */
+	__uint32_t	agf_magicnum;	/* magic number == XFS_AGF_MAGIC */
+	__uint32_t	agf_versionnum; /* header version == XFS_AGF_VERSION */
+	xfs_agnumber_t	agf_seqno;	/* sequence # starting from 0 */
+	xfs_agblock_t	agf_length;	/* size in blocks of a.g. */
+	/*
+	 * Freespace information
+	 */
+	xfs_agblock_t	agf_roots[XFS_BTNUM_AGF];	/* root blocks */
+	__uint32_t	agf_spare0;	/* spare field */
+	__uint32_t	agf_levels[XFS_BTNUM_AGF];	/* btree levels */
+	__uint32_t	agf_spare1;	/* spare field */
+	__uint32_t	agf_flfirst;	/* first freelist block's index */
+	__uint32_t	agf_fllast;	/* last freelist block's index */
+	__uint32_t	agf_flcount;	/* count of blocks in freelist */
+	xfs_extlen_t	agf_freeblks;	/* total free blocks */
+	xfs_extlen_t	agf_longest;	/* longest free space */
+} xfs_agf_t;
+
+#define XFS_AGF_MAGICNUM	0x00000001
+#define XFS_AGF_VERSIONNUM	0x00000002
+#define XFS_AGF_SEQNO		0x00000004
+#define XFS_AGF_LENGTH		0x00000008
+#define XFS_AGF_ROOTS		0x00000010
+#define XFS_AGF_LEVELS		0x00000020
+#define XFS_AGF_FLFIRST		0x00000040
+#define XFS_AGF_FLLAST		0x00000080
+#define XFS_AGF_FLCOUNT		0x00000100
+#define XFS_AGF_FREEBLKS	0x00000200
+#define XFS_AGF_LONGEST		0x00000400
+#define XFS_AGF_NUM_BITS	11
+#define XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGF_DADDR		((xfs_daddr_t)1)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGF_BLOCK)
+xfs_agblock_t xfs_agf_block(struct xfs_mount *mp);
+#define XFS_AGF_BLOCK(mp)	xfs_agf_block(mp)
+#else
+#define XFS_AGF_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGF_DADDR)
+#endif
+
+/*
+ * Size of the unlinked inode hash table in the agi.
+ */
+#define XFS_AGI_UNLINKED_BUCKETS	64
+
+typedef struct xfs_agi
+{
+	/*
+	 * Common allocation group header information
+	 */
+	__uint32_t	agi_magicnum;	/* magic number == XFS_AGI_MAGIC */
+	__uint32_t	agi_versionnum; /* header version == XFS_AGI_VERSION */
+	xfs_agnumber_t	agi_seqno;	/* sequence # starting from 0 */
+	xfs_agblock_t	agi_length;	/* size in blocks of a.g. */
+	/*
+	 * Inode information
+	 * Inodes are mapped by interpreting the inode number, so no
+	 * mapping data is needed here.
+	 */
+	xfs_agino_t	agi_count;	/* count of allocated inodes */
+	xfs_agblock_t	agi_root;	/* root of inode btree */
+	__uint32_t	agi_level;	/* levels in inode btree */
+	xfs_agino_t	agi_freecount;	/* number of free inodes */
+	xfs_agino_t	agi_newino;	/* new inode just allocated */
+	xfs_agino_t	agi_dirino;	/* last directory inode chunk */
+	/*
+	 * Hash table of inodes which have been unlinked but are
+	 * still being referenced.
+	 */
+	xfs_agino_t	agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
+} xfs_agi_t;
+
+#define XFS_AGI_MAGICNUM	0x00000001
+#define XFS_AGI_VERSIONNUM	0x00000002
+#define XFS_AGI_SEQNO		0x00000004
+#define XFS_AGI_LENGTH		0x00000008
+#define XFS_AGI_COUNT		0x00000010
+#define XFS_AGI_ROOT		0x00000020
+#define XFS_AGI_LEVEL		0x00000040
+#define XFS_AGI_FREECOUNT	0x00000080
+#define XFS_AGI_NEWINO		0x00000100
+#define XFS_AGI_DIRINO		0x00000200
+#define XFS_AGI_UNLINKED	0x00000400
+#define XFS_AGI_NUM_BITS	11
+#define XFS_AGI_ALL_BITS	((1 << XFS_AGI_NUM_BITS) - 1)
+
+/* disk block (xfs_daddr_t) in the AG */
+#define XFS_AGI_DADDR		((xfs_daddr_t)2)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGI_BLOCK)
+xfs_agblock_t xfs_agi_block(struct xfs_mount *mp);
+#define XFS_AGI_BLOCK(mp)	xfs_agi_block(mp)
+#else
+#define XFS_AGI_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGI_DADDR)
+#endif
+
+/*
+ * The third a.g. block contains the a.g. freelist, an array
+ * of block pointers to blocks owned by the allocation btree code.
+ */
+#define XFS_AGFL_DADDR		((xfs_daddr_t)3)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGFL_BLOCK)
+xfs_agblock_t xfs_agfl_block(struct xfs_mount *mp);
+#define XFS_AGFL_BLOCK(mp)	xfs_agfl_block(mp)
+#else
+#define XFS_AGFL_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR)
+#endif
+#define XFS_AGFL_SIZE		(BBSIZE / sizeof(xfs_agblock_t))
+typedef struct xfs_agfl
+{
+	xfs_agblock_t	agfl_bno[XFS_AGFL_SIZE];
+} xfs_agfl_t;
+
+/*
+ * Busy block/extent entry.  Used in perag to mark blocks that have been freed
+ * but whose transactions aren't committed to disk yet.
+ */
+typedef struct xfs_perag_busy {
+	xfs_agblock_t	busy_start;
+	xfs_extlen_t	busy_length;
+	struct xfs_trans *busy_tp;	/* transaction that did the free */
+} xfs_perag_busy_t;
+
+/*
+ * Per-ag incore structure, copies of information in agf and agi,
+ * to improve the performance of allocation group selection.
+ *
+ * pick sizes which fit in allocation buckets well
+ */
+#if (BITS_PER_LONG == 32)
+#define XFS_PAGB_NUM_SLOTS	84
+#elif (BITS_PER_LONG == 64)
+#define XFS_PAGB_NUM_SLOTS	128
+#endif
+
+typedef struct xfs_perag
+{
+	char		pagf_init;	/* this agf's entry is initialized */
+	char		pagi_init;	/* this agi's entry is initialized */
+	char		pagf_metadata;	/* the agf is prefered to be metadata */
+	char		pagi_inodeok;	/* The agi is ok for inodes */
+	__uint8_t	pagf_levels[XFS_BTNUM_AGF];
+					/* # of levels in bno & cnt btree */
+	__uint32_t	pagf_flcount;	/* count of blocks in freelist */
+	xfs_extlen_t	pagf_freeblks;	/* total free blocks */
+	xfs_extlen_t	pagf_longest;	/* longest free space */
+	xfs_agino_t	pagi_freecount; /* number of free inodes */
+#ifdef __KERNEL__
+	lock_t		pagb_lock;	/* lock for pagb_list */
+#endif
+	int		pagb_count;	/* pagb slots in use */
+	xfs_perag_busy_t *pagb_list;	/* unstable blocks */
+} xfs_perag_t;
+
+#define XFS_AG_MIN_BYTES	(1LL << 24)	/* 16 MB */
+#define XFS_AG_BEST_BYTES	(1LL << 30)	/*  1 GB */
+#define XFS_AG_MAX_BYTES	(1LL << 32)	/*  4 GB */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MIN_BLOCKS)
+xfs_extlen_t xfs_ag_min_blocks(int bl);
+#define XFS_AG_MIN_BLOCKS(bl)		xfs_ag_min_blocks(bl)
+#else
+#define XFS_AG_MIN_BLOCKS(bl)	((xfs_extlen_t)(XFS_AG_MIN_BYTES >> bl))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_BEST_BLOCKS)
+xfs_extlen_t xfs_ag_best_blocks(int bl, xfs_drfsbno_t blks);
+#define XFS_AG_BEST_BLOCKS(bl,blks)	xfs_ag_best_blocks(bl,blks)
+#else
+/*--#define	XFS_AG_BEST_BLOCKS(bl)	((xfs_extlen_t)(XFS_AG_BEST_BYTES >> bl))*/
+/*
+ * Best is XFS_AG_BEST_BLOCKS at and below 64 Gigabyte filesystems, and
+ * XFS_AG_MAX_BLOCKS above 64 Gigabytes.
+ */
+#define XFS_AG_BEST_BLOCKS(bl,blks)	((xfs_extlen_t)((1LL << (36 - bl)) >= \
+							blks) ? \
+							((xfs_extlen_t)(XFS_AG_BEST_BYTES >> bl)) : \
+							XFS_AG_MAX_BLOCKS(bl))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MAX_BLOCKS)
+xfs_extlen_t xfs_ag_max_blocks(int bl);
+#define XFS_AG_MAX_BLOCKS(bl)		xfs_ag_max_blocks(bl)
+#else
+#define XFS_AG_MAX_BLOCKS(bl)	((xfs_extlen_t)(XFS_AG_MAX_BYTES >> bl))
+#endif
+
+#define XFS_MAX_AGNUMBER	((xfs_agnumber_t)(NULLAGNUMBER - 1))
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_MAXLEVELS)
+int xfs_ag_maxlevels(struct xfs_mount *mp);
+#define XFS_AG_MAXLEVELS(mp)		xfs_ag_maxlevels(mp)
+#else
+#define XFS_AG_MAXLEVELS(mp)	((mp)->m_ag_maxlevels)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST)
+int xfs_min_freelist(xfs_agf_t *a, struct xfs_mount *mp);
+#define XFS_MIN_FREELIST(a,mp)		xfs_min_freelist(a,mp)
+#else
+#define XFS_MIN_FREELIST(a,mp)	\
+	XFS_MIN_FREELIST_RAW(	\
+		INT_GET((a)->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT), \
+		INT_GET((a)->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT), mp)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_PAG)
+int xfs_min_freelist_pag(xfs_perag_t *pag, struct xfs_mount *mp);
+#define XFS_MIN_FREELIST_PAG(pag,mp)	xfs_min_freelist_pag(pag,mp)
+#else
+#define XFS_MIN_FREELIST_PAG(pag,mp)	\
+	XFS_MIN_FREELIST_RAW((uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \
+			     (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MIN_FREELIST_RAW)
+int xfs_min_freelist_raw(int bl, int cl, struct xfs_mount *mp);
+#define XFS_MIN_FREELIST_RAW(bl,cl,mp)	xfs_min_freelist_raw(bl,cl,mp)
+#else
+#define XFS_MIN_FREELIST_RAW(bl,cl,mp)	\
+	(MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + \
+	 MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_FSB)
+xfs_fsblock_t xfs_agb_to_fsb(struct xfs_mount *mp, xfs_agnumber_t agno,
+			     xfs_agblock_t agbno);
+#define XFS_AGB_TO_FSB(mp,agno,agbno)	xfs_agb_to_fsb(mp,agno,agbno)
+#else
+#define XFS_AGB_TO_FSB(mp,agno,agbno) \
+	(((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGNO)
+xfs_agnumber_t xfs_fsb_to_agno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
+#define XFS_FSB_TO_AGNO(mp,fsbno)	xfs_fsb_to_agno(mp,fsbno)
+#else
+#define XFS_FSB_TO_AGNO(mp,fsbno) \
+	((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_AGBNO)
+xfs_agblock_t xfs_fsb_to_agbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
+#define XFS_FSB_TO_AGBNO(mp,fsbno)	xfs_fsb_to_agbno(mp,fsbno)
+#else
+#define XFS_FSB_TO_AGBNO(mp,fsbno) \
+	((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGB_TO_DADDR)
+xfs_daddr_t xfs_agb_to_daddr(struct xfs_mount *mp, xfs_agnumber_t agno,
+			 xfs_agblock_t agbno);
+#define XFS_AGB_TO_DADDR(mp,agno,agbno) xfs_agb_to_daddr(mp,agno,agbno)
+#else
+#define XFS_AGB_TO_DADDR(mp,agno,agbno) \
+	((xfs_daddr_t)(XFS_FSB_TO_BB(mp, \
+		(xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))))
+#endif
+/*
+ * XFS_DADDR_TO_AGNO and XFS_DADDR_TO_AGBNO moved to xfs_mount.h
+ * to avoid header file ordering change
+ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_DADDR)
+xfs_daddr_t xfs_ag_daddr(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_daddr_t d);
+#define XFS_AG_DADDR(mp,agno,d)		xfs_ag_daddr(mp,agno,d)
+#else
+#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGF)
+xfs_agf_t *xfs_buf_to_agf(struct xfs_buf *bp);
+#define XFS_BUF_TO_AGF(bp)		xfs_buf_to_agf(bp)
+#else
+#define XFS_BUF_TO_AGF(bp)	((xfs_agf_t *)XFS_BUF_PTR(bp))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGI)
+xfs_agi_t *xfs_buf_to_agi(struct xfs_buf *bp);
+#define XFS_BUF_TO_AGI(bp)		xfs_buf_to_agi(bp)
+#else
+#define XFS_BUF_TO_AGI(bp)	((xfs_agi_t *)XFS_BUF_PTR(bp))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_AGFL)
+xfs_agfl_t *xfs_buf_to_agfl(struct xfs_buf *bp);
+#define XFS_BUF_TO_AGFL(bp)		xfs_buf_to_agfl(bp)
+#else
+#define XFS_BUF_TO_AGFL(bp)	((xfs_agfl_t *)XFS_BUF_PTR(bp))
+#endif
+
+/*
+ * For checking for bad ranges of xfs_daddr_t's, covering multiple
+ * allocation groups or a single xfs_daddr_t that's a superblock copy.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AG_CHECK_DADDR)
+void xfs_ag_check_daddr(struct xfs_mount *mp, xfs_daddr_t d, xfs_extlen_t len);
+#define XFS_AG_CHECK_DADDR(mp,d,len)	xfs_ag_check_daddr(mp,d,len)
+#else
+#define XFS_AG_CHECK_DADDR(mp,d,len)	\
+	((len) == 1 ? \
+	    ASSERT((d) == XFS_SB_DADDR || \
+		   XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \
+	    ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \
+		   XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1)))
+#endif
+
+#endif	/* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
new file mode 100644
index 000000000000..4489ce3a1502
--- /dev/null
+++ b/fs/xfs/xfs_alloc.c
@@ -0,0 +1,2639 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Free space allocation for XFS.
+ */
+#include <xfs.h>
+
+#if defined(DEBUG)
+/*
+ * Allocation tracing.
+ */
+ktrace_t	*xfs_alloc_trace_buf;
+#endif
+
+#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
+
+#define XFSA_FIXUP_BNO_OK	1
+#define XFSA_FIXUP_CNT_OK	2
+
+int
+xfs_alloc_search_busy(xfs_trans_t *tp,
+		    xfs_agnumber_t agno,
+		    xfs_agblock_t bno,
+		    xfs_extlen_t len);
+
+#if defined(XFS_ALLOC_TRACE)
+#define TRACE_ALLOC(s,a)	\
+	xfs_alloc_trace_alloc(fname, s, a, __LINE__)
+#define TRACE_FREE(s,a,b,x,f)	\
+	xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__)
+#define TRACE_MODAGF(s,a,f)	\
+	xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__)
+#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp)	\
+	xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
+#define TRACE_UNBUSY(fname,s,ag,sl,tp)	\
+	xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
+#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)	\
+	xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
+
+
+#else
+#define TRACE_ALLOC(s,a)
+#define TRACE_FREE(s,a,b,x,f)
+#define TRACE_MODAGF(s,a,f)
+#define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
+#define TRACE_UNBUSY(fname,s,ag,sl,tp)
+#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp)
+#endif	/* XFS_ALLOC_TRACE */
+
+/*
+ * Prototypes for per-ag allocation routines
+ */
+
+STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
+STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
+	xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+
+/*
+ * Internal functions.
+ */
+
+/*
+ * Compute aligned version of the found extent.
+ * Takes alignment and min length into account.
+ */
+STATIC int				/* success (>= minlen) */
+xfs_alloc_compute_aligned(
+	xfs_agblock_t	foundbno,	/* starting block in found extent */
+	xfs_extlen_t	foundlen,	/* length in found extent */
+	xfs_extlen_t	alignment,	/* alignment for allocation */
+	xfs_extlen_t	minlen,		/* minimum length for allocation */
+	xfs_agblock_t	*resbno,	/* result block number */
+	xfs_extlen_t	*reslen)	/* result length */
+{
+	xfs_agblock_t	bno;
+	xfs_extlen_t	diff;
+	xfs_extlen_t	len;
+
+	if (alignment > 1 && foundlen >= minlen) {
+		bno = roundup(foundbno, alignment);
+		diff = bno - foundbno;
+		len = diff >= foundlen ? 0 : foundlen - diff;
+	} else {
+		bno = foundbno;
+		len = foundlen;
+	}
+	*resbno = bno;
+	*reslen = len;
+	return len >= minlen;
+}
+
+/*
+ * Compute best start block and diff for "near" allocations.
+ * freelen >= wantlen already checked by caller.
+ */
+STATIC xfs_extlen_t			/* difference value (absolute) */
+xfs_alloc_compute_diff(
+	xfs_agblock_t	wantbno,	/* target starting block */
+	xfs_extlen_t	wantlen,	/* target length */
+	xfs_extlen_t	alignment,	/* target alignment */
+	xfs_agblock_t	freebno,	/* freespace's starting block */
+	xfs_extlen_t	freelen,	/* freespace's length */
+	xfs_agblock_t	*newbnop)	/* result: best start block from free */
+{
+	xfs_agblock_t	freeend;	/* end of freespace extent */
+	xfs_agblock_t	newbno1;	/* return block number */
+	xfs_agblock_t	newbno2;	/* other new block number */
+	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
+	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
+	xfs_agblock_t	wantend;	/* end of target extent */
+
+	ASSERT(freelen >= wantlen);
+	freeend = freebno + freelen;
+	wantend = wantbno + wantlen;
+	if (freebno >= wantbno) {
+		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else if (freeend >= wantend && alignment > 1) {
+		newbno1 = roundup(wantbno, alignment);
+		newbno2 = newbno1 - alignment;
+		if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+		else
+			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
+		if (newbno2 < freebno)
+			newbno2 = NULLAGBLOCK;
+		else
+			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
+		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
+			if (newlen1 < newlen2 ||
+			    (newlen1 == newlen2 &&
+			     XFS_ABSDIFF(newbno1, wantbno) >
+			     XFS_ABSDIFF(newbno2, wantbno)))
+				newbno1 = newbno2;
+		} else if (newbno2 != NULLAGBLOCK)
+			newbno1 = newbno2;
+	} else if (freeend >= wantend) {
+		newbno1 = wantbno;
+	} else if (alignment > 1) {
+		newbno1 = roundup(freeend - wantlen, alignment);
+		if (newbno1 > freeend - wantlen &&
+		    newbno1 - alignment >= freebno)
+			newbno1 -= alignment;
+		else if (newbno1 >= freeend)
+			newbno1 = NULLAGBLOCK;
+	} else
+		newbno1 = freeend - wantlen;
+	*newbnop = newbno1;
+	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+}
+
+/*
+ * Fix up the length, based on mod and prod.
+ * len should be k * prod + mod for some k.
+ * If len is too small it is returned unchanged.
+ * If len hits maxlen it is left alone.
+ */
+STATIC void
+xfs_alloc_fix_len(
+	xfs_alloc_arg_t *args)		/* allocation argument structure */
+{
+	xfs_extlen_t	k;
+	xfs_extlen_t	rlen;
+
+	ASSERT(args->mod < args->prod);
+	rlen = args->len;
+	ASSERT(rlen >= args->minlen);
+	ASSERT(rlen <= args->maxlen);
+	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
+	    (args->mod == 0 && rlen < args->prod))
+		return;
+	k = rlen % args->prod;
+	if (k == args->mod)
+		return;
+	if (k > args->mod) {
+		if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen)
+			return;
+	} else {
+		if ((int)(rlen = rlen - args->prod - (args->mod - k)) <
+		    (int)args->minlen)
+			return;
+	}
+	ASSERT(rlen >= args->minlen);
+	ASSERT(rlen <= args->maxlen);
+	args->len = rlen;
+}
+
+/*
+ * Fix up length if there is too little space left in the a.g.
+ * Return 1 if ok, 0 if too little, should give up.
+ */
+STATIC int
+xfs_alloc_fix_minleft(
+	xfs_alloc_arg_t *args)		/* allocation argument structure */
+{
+	xfs_agf_t	*agf;		/* a.g. freelist header */
+	int		diff;		/* free space difference */
+
+	if (args->minleft == 0)
+		return 1;
+	agf = XFS_BUF_TO_AGF(args->agbp);
+	diff = INT_GET(agf->agf_freeblks, ARCH_CONVERT)
+		+ INT_GET(agf->agf_flcount, ARCH_CONVERT)
+		- args->len - args->minleft;
+	if (diff >= 0)
+		return 1;
+	args->len += diff;		/* shrink the allocated space */
+	if (args->len >= args->minlen)
+		return 1;
+	args->agbno = NULLAGBLOCK;
+	return 0;
+}
+
+/*
+ * Update the two btrees, logically removing from freespace the extent
+ * starting at rbno, rlen blocks.  The extent is contained within the
+ * actual (current) free extent fbno for flen blocks.
+ * Flags are passed in indicating whether the cursors are set to the
+ * relevant records.
+ */
+STATIC int				/* error code */
+xfs_alloc_fixup_trees(
+	xfs_btree_cur_t *cnt_cur,	/* cursor for by-size btree */
+	xfs_btree_cur_t *bno_cur,	/* cursor for by-block btree */
+	xfs_agblock_t	fbno,		/* starting block of free extent */
+	xfs_extlen_t	flen,		/* length of free extent */
+	xfs_agblock_t	rbno,		/* starting block of returned extent */
+	xfs_extlen_t	rlen,		/* length of returned extent */
+	int		flags)		/* flags, XFSA_FIXUP_... */
+{
+	int		error;		/* error code */
+	int		i;		/* operation results */
+	xfs_agblock_t	nfbno1;		/* first new free startblock */
+	xfs_agblock_t	nfbno2;		/* second new free startblock */
+	xfs_extlen_t	nflen1=0;	/* first new free length */
+	xfs_extlen_t	nflen2=0;	/* second new free length */
+
+	/*
+	 * Look up the record in the by-size tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_CNT_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	/*
+	 * Look up the record in the by-block tree if necessary.
+	 */
+	if (flags & XFSA_FIXUP_BNO_OK) {
+#ifdef DEBUG
+		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(
+			i == 1 && nfbno1 == fbno && nflen1 == flen);
+#endif
+	} else {
+		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+#ifdef DEBUG
+	{
+		xfs_alloc_block_t	*bnoblock;
+		xfs_alloc_block_t	*cntblock;
+
+		if (bno_cur->bc_nlevels == 1 &&
+		    cnt_cur->bc_nlevels == 1) {
+			bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]);
+			cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]);
+			XFS_WANT_CORRUPTED_RETURN(
+				INT_GET(bnoblock->bb_numrecs, ARCH_CONVERT) == INT_GET(cntblock->bb_numrecs, ARCH_CONVERT));
+		}
+	}
+#endif
+	/*
+	 * Deal with all four cases: the allocated record is contained
+	 * within the freespace record, so we can have new freespace
+	 * at either (or both) end, or no freespace remaining.
+	 */
+	if (rbno == fbno && rlen == flen)
+		nfbno1 = nfbno2 = NULLAGBLOCK;
+	else if (rbno == fbno) {
+		nfbno1 = rbno + rlen;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else if (rbno + rlen == fbno + flen) {
+		nfbno1 = fbno;
+		nflen1 = flen - rlen;
+		nfbno2 = NULLAGBLOCK;
+	} else {
+		nfbno1 = fbno;
+		nflen1 = rbno - fbno;
+		nfbno2 = rbno + rlen;
+		nflen2 = (fbno + flen) - nfbno2;
+	}
+	/*
+	 * Delete the entry from the by-size btree.
+	 */
+	if ((error = xfs_alloc_delete(cnt_cur, &i)))
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(i == 1);
+	/*
+	 * Add new by-size btree entry(s).
+	 */
+	if (nfbno1 != NULLAGBLOCK) {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		if ((error = xfs_alloc_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		if ((error = xfs_alloc_insert(cnt_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	/*
+	 * Fix up the by-block btree entry(s).
+	 */
+	if (nfbno1 == NULLAGBLOCK) {
+		/*
+		 * No remaining freespace, just delete the by-block tree entry.
+		 */
+		if ((error = xfs_alloc_delete(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	} else {
+		/*
+		 * Update the by-block entry to start later|be shorter.
+		 */
+		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
+			return error;
+	}
+	if (nfbno2 != NULLAGBLOCK) {
+		/*
+		 * 2 resulting free entries, need to add one.
+		 */
+		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 0);
+		if ((error = xfs_alloc_insert(bno_cur, &i)))
+			return error;
+		XFS_WANT_CORRUPTED_RETURN(i == 1);
+	}
+	return 0;
+}
+
+/*
+ * Read in the allocation group free block array.
+ */
+STATIC int				/* error */
+xfs_alloc_read_agfl(
+	xfs_mount_t	*mp,		/* mount point structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_buf_t	**bpp)		/* buffer for the ag free block array */
+{
+	xfs_buf_t	*bp;		/* return value */
+	xfs_daddr_t	d;		/* disk block address */
+	int		error;
+
+	ASSERT(agno != NULLAGNUMBER);
+	d = XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 1, 0, &bp)))
+		return error;
+	ASSERT(bp);
+	ASSERT(!XFS_BUF_GETERROR(bp));
+	XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF);
+	*bpp = bp;
+	return 0;
+}
+
+#if defined(XFS_ALLOC_TRACE)
+/*
+ * Add an allocation trace entry for an alloc call.
+ */
+STATIC void
+xfs_alloc_trace_alloc(
+	char		*name,		/* function tag string */
+	char		*str,		/* additional string */
+	xfs_alloc_arg_t *args,		/* allocation argument structure */
+	int		line)		/* source line number */
+{
+	ktrace_enter(xfs_alloc_trace_buf,
+		(void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)),
+		(void *)name,
+		(void *)str,
+		(void *)args->mp,
+		(void *)(__psunsigned_t)args->agno,
+		(void *)(__psunsigned_t)args->agbno,
+		(void *)(__psunsigned_t)args->minlen,
+		(void *)(__psunsigned_t)args->maxlen,
+		(void *)(__psunsigned_t)args->mod,
+		(void *)(__psunsigned_t)args->prod,
+		(void *)(__psunsigned_t)args->minleft,
+		(void *)(__psunsigned_t)args->total,
+		(void *)(__psunsigned_t)args->alignment,
+		(void *)(__psunsigned_t)args->len,
+		(void *)((((__psint_t)args->type) << 16) |
+			 (__psint_t)args->otype),
+		(void *)(__psint_t)((args->wasdel << 3) |
+				    (args->wasfromfl << 2) |
+				    (args->isfl << 1) |
+				    (args->userdata << 0)));
+}
+
+/*
+ * Add an allocation trace entry for a free call.
+ */
+STATIC void
+xfs_alloc_trace_free(
+	char		*name,		/* function tag string */
+	char		*str,		/* additional string */
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* a.g. relative block number */
+	xfs_extlen_t	len,		/* length of extent */
+	int		isfl,		/* set if is freelist allocation/free */
+	int		line)		/* source line number */
+{
+	ktrace_enter(xfs_alloc_trace_buf,
+		(void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)),
+		(void *)name,
+		(void *)str,
+		(void *)mp,
+		(void *)(__psunsigned_t)agno,
+		(void *)(__psunsigned_t)agbno,
+		(void *)(__psunsigned_t)len,
+		(void *)(__psint_t)isfl,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add an allocation trace entry for modifying an agf.
+ */
+STATIC void
+xfs_alloc_trace_modagf(
+	char		*name,		/* function tag string */
+	char		*str,		/* additional string */
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_agf_t	*agf,		/* new agf value */
+	int		flags,		/* logging flags for agf */
+	int		line)		/* source line number */
+{
+	ktrace_enter(xfs_alloc_trace_buf,
+		(void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)),
+		(void *)name,
+		(void *)str,
+		(void *)mp,
+		(void *)(__psint_t)flags,
+		(void *)(__psunsigned_t)INT_GET(agf->agf_seqno, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_length, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_BNO],
+						ARCH_CONVERT);
+		(void *)(__psunsigned_t)INT_GET(agf->agf_roots[XFS_BTNUM_CNT],
+						ARCH_CONVERT);
+		(void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_BNO],
+						ARCH_CONVERT);
+		(void *)(__psunsigned_t)INT_GET(agf->agf_levels[XFS_BTNUM_CNT],
+						ARCH_CONVERT);
+		(void *)(__psunsigned_t)INT_GET(agf->agf_flfirst, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_fllast, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_flcount, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_freeblks, ARCH_CONVERT),
+		(void *)(__psunsigned_t)INT_GET(agf->agf_longest, ARCH_CONVERT));
+}
+
+STATIC void
+xfs_alloc_trace_busy(
+	char		*name,		/* function tag string */
+	char		*str,		/* additional string */
+	xfs_mount_t	*mp,		/* file system mount poing */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* a.g. relative block number */
+	xfs_extlen_t	len,		/* length of extent */
+	int		slot,		/* perag Busy slot */
+	xfs_trans_t	*tp,
+	int		trtype,		/* type: add, delete, search */
+	int		line)		/* source line number */
+{
+	ktrace_enter(xfs_alloc_trace_buf,
+		(void *)(__psint_t)(trtype | (line << 16)),
+		(void *)name,
+		(void *)str,
+		(void *)mp,
+		(void *)(__psunsigned_t)agno,
+		(void *)(__psunsigned_t)agbno,
+		(void *)(__psunsigned_t)len,
+		(void *)(__psint_t)slot,
+		(void *)tp,
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+}
+#endif	/* XFS_ALLOC_TRACE */
+
+/*
+ * Allocation group level functions.
+ */
+
+/*
+ * Allocate a variable extent in the allocation group agno.
+ * Type and bno are used to determine where in the allocation group the
+ * extent will start.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent(
+	xfs_alloc_arg_t *args)	/* argument structure for allocation */
+{
+	int		error=0;
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent";
+#endif
+
+	ASSERT(args->minlen > 0);
+	ASSERT(args->maxlen > 0);
+	ASSERT(args->minlen <= args->maxlen);
+	ASSERT(args->mod < args->prod);
+	ASSERT(args->alignment > 0);
+	/*
+	 * Branch to correct routine based on the type.
+	 */
+	args->wasfromfl = 0;
+	switch (args->type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+		error = xfs_alloc_ag_vextent_size(args);
+		break;
+	case XFS_ALLOCTYPE_NEAR_BNO:
+		error = xfs_alloc_ag_vextent_near(args);
+		break;
+	case XFS_ALLOCTYPE_THIS_BNO:
+		error = xfs_alloc_ag_vextent_exact(args);
+		break;
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+	if (error)
+		return error;
+	/*
+	 * If the allocation worked, need to change the agf structure
+	 * (and log it), and the superblock.
+	 */
+	if (args->agbno != NULLAGBLOCK) {
+		xfs_agf_t	*agf;	/* allocation group freelist header */
+#ifdef XFS_ALLOC_TRACE
+		xfs_mount_t	*mp = args->mp;
+#endif
+		long		slen = (long)args->len;
+
+		ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
+		ASSERT(!(args->wasfromfl) || !args->isfl);
+		ASSERT(args->agbno % args->alignment == 0);
+		if (!(args->wasfromfl)) {
+
+			agf = XFS_BUF_TO_AGF(args->agbp);
+			INT_MOD(agf->agf_freeblks, ARCH_CONVERT, -(args->len));
+			xfs_trans_agblocks_delta(args->tp,
+						 -((long)(args->len)));
+			args->pag->pagf_freeblks -= args->len;
+			ASSERT(INT_GET(agf->agf_freeblks, ARCH_CONVERT)
+				<= INT_GET(agf->agf_length, ARCH_CONVERT));
+			TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
+			xfs_alloc_log_agf(args->tp, args->agbp,
+						XFS_AGF_FREEBLKS);
+			/* search the busylist for these blocks */
+			xfs_alloc_search_busy(args->tp, args->agno,
+					args->agbno, args->len);
+		}
+		if (!args->isfl)
+			xfs_trans_mod_sb(args->tp,
+				args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
+					XFS_TRANS_SB_FDBLOCKS, -slen);
+		XFS_STATS_INC(xfsstats.xs_allocx);
+		XFS_STATS_ADD(xfsstats.xs_allocb, args->len);
+	}
+	return 0;
+}
+
+/*
+ * Allocate a variable extent at exactly agno/bno.
+ * Extent's length (returned in *len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_exact(
+	xfs_alloc_arg_t *args)	/* allocation argument structure */
+{
+	xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
+	xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
+	xfs_agblock_t	end;	/* end of allocated extent */
+	int		error;
+	xfs_agblock_t	fbno;	/* start block of found extent */
+	xfs_agblock_t	fend;	/* end block of found extent */
+	xfs_extlen_t	flen;	/* length of found extent */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent_exact";
+#endif
+	int		i;	/* success/failure of operation */
+	xfs_agblock_t	maxend; /* end of maximal extent */
+	xfs_agblock_t	minend; /* end of minimal extent */
+	xfs_extlen_t	rlen;	/* length of returned extent */
+
+	ASSERT(args->alignment == 1);
+	/*
+	 * Allocate/initialize a cursor for the by-number freespace btree.
+	 */
+	bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO, 0, 0);
+	/*
+	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
+	 * Look for the closest free block <= bno, it must contain bno
+	 * if any free block does.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * Didn't find it, return null.
+		 */
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * Grab the freespace record.
+	 */
+	if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	ASSERT(fbno <= args->agbno);
+	minend = args->agbno + args->minlen;
+	maxend = args->agbno + args->maxlen;
+	fend = fbno + flen;
+	/*
+	 * Give up if the freespace isn't long enough for the minimum request.
+	 */
+	if (fend < minend) {
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * End of extent will be smaller of the freespace end and the
+	 * maximal requested end.
+	 */
+	end = XFS_AGBLOCK_MIN(fend, maxend);
+	/*
+	 * Fix the length according to mod and prod if given.
+	 */
+	args->len = end - args->agbno;
+	xfs_alloc_fix_len(args);
+	if (!xfs_alloc_fix_minleft(args)) {
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+		return 0;
+	}
+	rlen = args->len;
+	ASSERT(args->agbno + rlen <= fend);
+	end = args->agbno + rlen;
+	/*
+	 * We are allocating agbno for rlen [agbno .. end]
+	 * Allocate/initialize a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT, 0, 0);
+	ASSERT(args->agbno + args->len <=
+		INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+			ARCH_CONVERT));
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+			args->agbno, args->len, XFSA_FIXUP_BNO_OK))) {
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+		goto error0;
+	}
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	TRACE_ALLOC("normal", args);
+	args->wasfromfl = 0;
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	TRACE_ALLOC("error", args);
+	return error;
+}
+
+/*
+ * Allocate a variable extent near bno in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int				/* error */
+xfs_alloc_ag_vextent_near(
+	xfs_alloc_arg_t *args)		/* allocation argument structure */
+{
+	xfs_btree_cur_t *bno_cur_gt;	/* cursor for bno btree, right side */
+	xfs_btree_cur_t *bno_cur_lt;	/* cursor for bno btree, left side */
+	xfs_btree_cur_t *cnt_cur;	/* cursor for count btree */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent_near";
+#endif
+	xfs_agblock_t	gtbno;		/* start bno of right side entry */
+	xfs_agblock_t	gtbnoa;		/* aligned ... */
+	xfs_extlen_t	gtdiff;		/* difference to right side entry */
+	xfs_extlen_t	gtlen;		/* length of right side entry */
+	xfs_extlen_t	gtlena;		/* aligned ... */
+	xfs_agblock_t	gtnew;		/* useful start bno of right side */
+	int		error;		/* error code */
+	int		i;		/* result code, temporary */
+	int		j;		/* result code, temporary */
+	xfs_agblock_t	ltbno;		/* start bno of left side entry */
+	xfs_agblock_t	ltbnoa;		/* aligned ... */
+	xfs_extlen_t	ltdiff;		/* difference to left side entry */
+	/*REFERENCED*/
+	xfs_agblock_t	ltend;		/* end bno of left side entry */
+	xfs_extlen_t	ltlen;		/* length of left side entry */
+	xfs_extlen_t	ltlena;		/* aligned ... */
+	xfs_agblock_t	ltnew;		/* useful start bno of left side */
+	xfs_extlen_t	rlen;		/* length of returned extent */
+#if defined(DEBUG) && defined(__KERNEL__)
+	/*
+	 * Randomly don't execute the first algorithm.
+	 */
+	static int	seed;		/* randomizing seed value */
+	int		dofirst;	/* set to do first algorithm */
+	timespec_t	now;		/* current time */
+
+	if (!seed) {
+		nanotime(&now);
+		seed = (int)now.tv_sec ^ (int)now.tv_nsec;
+	}
+	dofirst = random() & 1;
+#endif
+	/*
+	 * Get a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT, 0, 0);
+	ltlen = 0;
+	bno_cur_lt = bno_cur_gt = NULL;
+	/*
+	 * See if there are any free extents as big as maxlen.
+	 */
+	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
+		goto error0;
+	/*
+	 * If none, then pick up the last entry in the tree unless the
+	 * tree is empty.
+	 */
+	if (!i) {
+		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &ltbno,
+				&ltlen, &i)))
+			goto error0;
+		if (i == 0 || ltlen == 0) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			return 0;
+		}
+		ASSERT(i == 1);
+	}
+	args->wasfromfl = 0;
+	/*
+	 * First algorithm.
+	 * If the requested extent is large wrt the freespaces available
+	 * in this a.g., then the cursor will be pointing to a btree entry
+	 * near the right edge of the tree.  If it's in the last btree leaf
+	 * block, then we just examine all the entries in that block
+	 * that are big enough, and pick the best one.
+	 * This is written as a while loop so we can break out of it,
+	 * but we never loop back to the top.
+	 */
+	while (xfs_btree_islastblock(cnt_cur, 0)) {
+		xfs_extlen_t	bdiff;
+		int		besti=0;
+		xfs_extlen_t	blen=0;
+		xfs_agblock_t	bnew=0;
+
+#if defined(DEBUG) && defined(__KERNEL__)
+		if (!dofirst)
+			break;
+#endif
+		/*
+		 * Start from the entry that lookup found, sequence through
+		 * all larger free blocks.  If we're actually pointing at a
+		 * record smaller than maxlen, go to the start of this block,
+		 * and skip all those smaller than minlen.
+		 */
+		if (ltlen || args->alignment > 1) {
+			cnt_cur->bc_ptrs[0] = 1;
+			do {
+				if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
+						&ltlen, &i)))
+					goto error0;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+				if (ltlen >= args->minlen)
+					break;
+				if ((error = xfs_alloc_increment(cnt_cur, 0, &i)))
+					goto error0;
+			} while (i);
+			ASSERT(ltlen >= args->minlen);
+			if (!i)
+				break;
+		}
+		i = cnt_cur->bc_ptrs[0];
+		for (j = 1, blen = 0, bdiff = 0;
+		     !error && j && (blen < args->maxlen || bdiff > 0);
+		     error = xfs_alloc_increment(cnt_cur, 0, &j)) {
+			/*
+			 * For each entry, decide if it's better than
+			 * the previous best entry.
+			 */
+			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (!xfs_alloc_compute_aligned(ltbno, ltlen,
+					args->alignment, args->minlen,
+					&ltbnoa, &ltlena))
+				continue;
+			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			ASSERT(args->len >= args->minlen);
+			if (args->len < blen)
+				continue;
+			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
+				args->alignment, ltbno, ltlen, &ltnew);
+			if (ltnew != NULLAGBLOCK &&
+			    (args->len > blen || ltdiff < bdiff)) {
+				bdiff = ltdiff;
+				bnew = ltnew;
+				blen = args->len;
+				besti = cnt_cur->bc_ptrs[0];
+			}
+		}
+		/*
+		 * It didn't work.  We COULD be in a case where
+		 * there's a good record somewhere, so try again.
+		 */
+		if (blen == 0)
+			break;
+		/*
+		 * Point at the best entry, and retrieve it again.
+		 */
+		cnt_cur->bc_ptrs[0] = besti;
+		if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		ltend = ltbno + ltlen;
+		ASSERT(ltend <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+				ARCH_CONVERT));
+		args->len = blen;
+		if (!xfs_alloc_fix_minleft(args)) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			TRACE_ALLOC("nominleft", args);
+			return 0;
+		}
+		blen = args->len;
+		/*
+		 * We are allocating starting at bnew for blen blocks.
+		 */
+		args->agbno = bnew;
+		ASSERT(bnew >= ltbno);
+		ASSERT(bnew + blen <= ltend);
+		/*
+		 * Set up a cursor for the by-bno tree.
+		 */
+		bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp,
+			args->agbp, args->agno, XFS_BTNUM_BNO, 0, 0);
+		/*
+		 * Fix up the btree entries.
+		 */
+		if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno,
+				ltlen, bnew, blen, XFSA_FIXUP_CNT_OK)))
+			goto error0;
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+		TRACE_ALLOC("first", args);
+		return 0;
+	}
+	/*
+	 * Second algorithm.
+	 * Search in the by-bno tree to the left and to the right
+	 * simultaneously, until in each case we find a space big enough,
+	 * or run into the edge of the tree.  When we run into the edge,
+	 * we deallocate that cursor.
+	 * If both searches succeed, we compare the two spaces and pick
+	 * the better one.
+	 * With alignment, it's possible for both to fail; the upper
+	 * level algorithm that picks allocation groups for allocations
+	 * is not supposed to do this.
+	 */
+	/*
+	 * Allocate and initialize the cursor for the leftward search.
+	 */
+	bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO, 0, 0);
+	/*
+	 * Lookup <= bno to find the leftward search's starting point.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur_lt, args->agbno, args->maxlen, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * Didn't find anything; use this cursor for the rightward
+		 * search.
+		 */
+		bno_cur_gt = bno_cur_lt;
+		bno_cur_lt = 0;
+	}
+	/*
+	 * Found something.  Duplicate the cursor for the rightward search.
+	 */
+	else if ((error = xfs_btree_dup_cursor(bno_cur_lt, &bno_cur_gt)))
+		goto error0;
+	/*
+	 * Increment the cursor, so we will point at the entry just right
+	 * of the leftward entry if any, or to the leftmost entry.
+	 */
+	if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+		goto error0;
+	if (!i) {
+		/*
+		 * It failed, there are no rightward entries.
+		 */
+		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR);
+		bno_cur_gt = NULL;
+	}
+	/*
+	 * Loop going left with the leftward cursor, right with the
+	 * rightward cursor, until either both directions give up or
+	 * we find an entry at least as big as minlen.
+	 */
+	do {
+		if (bno_cur_lt) {
+			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (xfs_alloc_compute_aligned(ltbno, ltlen,
+					args->alignment, args->minlen,
+					&ltbnoa, &ltlena))
+				break;
+			if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
+				goto error0;
+			if (!i) {
+				xfs_btree_del_cursor(bno_cur_lt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_lt = NULL;
+			}
+		}
+		if (bno_cur_gt) {
+			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (xfs_alloc_compute_aligned(gtbno, gtlen,
+					args->alignment, args->minlen,
+					&gtbnoa, &gtlena))
+				break;
+			if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+				goto error0;
+			if (!i) {
+				xfs_btree_del_cursor(bno_cur_gt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_gt = NULL;
+			}
+		}
+	} while (bno_cur_lt || bno_cur_gt);
+	/*
+	 * Got both cursors still active, need to find better entry.
+	 */
+	if (bno_cur_lt && bno_cur_gt) {
+		/*
+		 * Left side is long enough, look for a right side entry.
+		 */
+		if (ltlena >= args->minlen) {
+			/*
+			 * Fix up the length.
+			 */
+			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			rlen = args->len;
+			ltdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+				args->alignment, ltbno, ltlen, &ltnew);
+			/*
+			 * Not perfect.
+			 */
+			if (ltdiff) {
+				/*
+				 * Look until we find a better one, run out of
+				 * space, or run off the end.
+				 */
+				while (bno_cur_lt && bno_cur_gt) {
+					if ((error = xfs_alloc_get_rec(
+							bno_cur_gt, &gtbno,
+							&gtlen, &i)))
+						goto error0;
+					XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+					xfs_alloc_compute_aligned(gtbno, gtlen,
+						args->alignment, args->minlen,
+						&gtbnoa, &gtlena);
+					/*
+					 * The left one is clearly better.
+					 */
+					if (gtbnoa >= args->agbno + ltdiff) {
+						xfs_btree_del_cursor(
+							bno_cur_gt,
+							XFS_BTREE_NOERROR);
+						bno_cur_gt = NULL;
+						break;
+					}
+					/*
+					 * If we reach a big enough entry,
+					 * compare the two and pick the best.
+					 */
+					if (gtlena >= args->minlen) {
+						args->len =
+							XFS_EXTLEN_MIN(gtlena,
+								args->maxlen);
+						xfs_alloc_fix_len(args);
+						rlen = args->len;
+						gtdiff = xfs_alloc_compute_diff(
+							args->agbno, rlen,
+							args->alignment,
+							gtbno, gtlen, &gtnew);
+						/*
+						 * Right side is better.
+						 */
+						if (gtdiff < ltdiff) {
+							xfs_btree_del_cursor(
+								bno_cur_lt,
+								XFS_BTREE_NOERROR);
+							bno_cur_lt = NULL;
+						}
+						/*
+						 * Left side is better.
+						 */
+						else {
+							xfs_btree_del_cursor(
+								bno_cur_gt,
+								XFS_BTREE_NOERROR);
+							bno_cur_gt = NULL;
+						}
+						break;
+					}
+					/*
+					 * Fell off the right end.
+					 */
+					if ((error = xfs_alloc_increment(
+							bno_cur_gt, 0, &i)))
+						goto error0;
+					if (!i) {
+						xfs_btree_del_cursor(
+							bno_cur_gt,
+							XFS_BTREE_NOERROR);
+						bno_cur_gt = NULL;
+						break;
+					}
+				}
+			}
+			/*
+			 * The left side is perfect, trash the right side.
+			 */
+			else {
+				xfs_btree_del_cursor(bno_cur_gt,
+						     XFS_BTREE_NOERROR);
+				bno_cur_gt = NULL;
+			}
+		}
+		/*
+		 * It's the right side that was found first, look left.
+		 */
+		else {
+			/*
+			 * Fix up the length.
+			 */
+			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
+			xfs_alloc_fix_len(args);
+			rlen = args->len;
+			gtdiff = xfs_alloc_compute_diff(args->agbno, rlen,
+				args->alignment, gtbno, gtlen, &gtnew);
+			/*
+			 * Right side entry isn't perfect.
+			 */
+			if (gtdiff) {
+				/*
+				 * Look until we find a better one, run out of
+				 * space, or run off the end.
+				 */
+				while (bno_cur_lt && bno_cur_gt) {
+					if ((error = xfs_alloc_get_rec(
+							bno_cur_lt, &ltbno,
+							&ltlen, &i)))
+						goto error0;
+					XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+					xfs_alloc_compute_aligned(ltbno, ltlen,
+						args->alignment, args->minlen,
+						&ltbnoa, &ltlena);
+					/*
+					 * The right one is clearly better.
+					 */
+					if (ltbnoa <= args->agbno - gtdiff) {
+						xfs_btree_del_cursor(
+							bno_cur_lt,
+							XFS_BTREE_NOERROR);
+						bno_cur_lt = NULL;
+						break;
+					}
+					/*
+					 * If we reach a big enough entry,
+					 * compare the two and pick the best.
+					 */
+					if (ltlena >= args->minlen) {
+						args->len = XFS_EXTLEN_MIN(
+							ltlena, args->maxlen);
+						xfs_alloc_fix_len(args);
+						rlen = args->len;
+						ltdiff = xfs_alloc_compute_diff(
+							args->agbno, rlen,
+							args->alignment,
+							ltbno, ltlen, &ltnew);
+						/*
+						 * Left side is better.
+						 */
+						if (ltdiff < gtdiff) {
+							xfs_btree_del_cursor(
+								bno_cur_gt,
+								XFS_BTREE_NOERROR);
+							bno_cur_gt = NULL;
+						}
+						/*
+						 * Right side is better.
+						 */
+						else {
+							xfs_btree_del_cursor(
+								bno_cur_lt,
+								XFS_BTREE_NOERROR);
+							bno_cur_lt = NULL;
+						}
+						break;
+					}
+					/*
+					 * Fell off the left end.
+					 */
+					if ((error = xfs_alloc_decrement(
+							bno_cur_lt, 0, &i)))
+						goto error0;
+					if (!i) {
+						xfs_btree_del_cursor(bno_cur_lt,
+							XFS_BTREE_NOERROR);
+						bno_cur_lt = NULL;
+						break;
+					}
+				}
+			}
+			/*
+			 * The right side is perfect, trash the left side.
+			 */
+			else {
+				xfs_btree_del_cursor(bno_cur_lt,
+					XFS_BTREE_NOERROR);
+				bno_cur_lt = NULL;
+			}
+		}
+	}
+	/*
+	 * If we couldn't get anything, give up.
+	 */
+	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+		TRACE_ALLOC("neither", args);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * At this point we have selected a freespace entry, either to the
+	 * left or to the right.  If it's on the right, copy all the
+	 * useful variables to the "left" set so we only have one
+	 * copy of this code.
+	 */
+	if (bno_cur_gt) {
+		bno_cur_lt = bno_cur_gt;
+		bno_cur_gt = NULL;
+		ltbno = gtbno;
+		ltbnoa = gtbnoa;
+		ltlen = gtlen;
+		ltlena = gtlena;
+		j = 1;
+	} else
+		j = 0;
+	/*
+	 * Fix up the length and compute the useful address.
+	 */
+	ltend = ltbno + ltlen;
+	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
+	xfs_alloc_fix_len(args);
+	if (!xfs_alloc_fix_minleft(args)) {
+		TRACE_ALLOC("nominleft", args);
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		return 0;
+	}
+	rlen = args->len;
+	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
+		ltlen, &ltnew);
+	ASSERT(ltnew >= ltbno);
+	ASSERT(ltnew + rlen <= ltend);
+	ASSERT(ltnew + rlen <= INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+		ARCH_CONVERT));
+	args->agbno = ltnew;
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
+			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
+		goto error0;
+	TRACE_ALLOC(j ? "gt" : "lt", args);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
+	return 0;
+
+ error0:
+	TRACE_ALLOC("error", args);
+	if (cnt_cur != NULL)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	if (bno_cur_lt != NULL)
+		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_ERROR);
+	if (bno_cur_gt != NULL)
+		xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Allocate a variable extent anywhere in the allocation group agno.
+ * Extent's length (returned in len) will be between minlen and maxlen,
+ * and of the form k * prod + mod unless there's nothing that large.
+ * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
+ */
+STATIC int				/* error */
+xfs_alloc_ag_vextent_size(
+	xfs_alloc_arg_t *args)		/* allocation argument structure */
+{
+	xfs_btree_cur_t *bno_cur;	/* cursor for bno btree */
+	xfs_btree_cur_t *cnt_cur;	/* cursor for cnt btree */
+	int		error;		/* error result */
+	xfs_agblock_t	fbno;		/* start of found freespace */
+	xfs_extlen_t	flen;		/* length of found freespace */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent_size";
+#endif
+	int		i;		/* temp status variable */
+	xfs_agblock_t	rbno;		/* returned block number */
+	xfs_extlen_t	rlen;		/* length of returned extent */
+
+	/*
+	 * Allocate and initialize a cursor for the by-size btree.
+	 */
+	cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_CNT, 0, 0);
+	bno_cur = NULL;
+	/*
+	 * Look for an entry >= maxlen+alignment-1 blocks.
+	 */
+	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
+			args->maxlen + args->alignment - 1, &i)))
+		goto error0;
+	/*
+	 * If none, then pick up the last entry in the tree unless the
+	 * tree is empty.
+	 */
+	if (!i) {
+		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
+				&flen, &i)))
+			goto error0;
+		if (i == 0 || flen == 0) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			TRACE_ALLOC("noentry", args);
+			return 0;
+		}
+		ASSERT(i == 1);
+	}
+	/*
+	 * There's a freespace as big as maxlen+alignment-1, get it.
+	 */
+	else {
+		if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	}
+	/*
+	 * In the first case above, we got the last entry in the
+	 * by-size btree.  Now we check to see if the space hits maxlen
+	 * once aligned; if not, we search left for something better.
+	 * This can't happen in the second case above.
+	 */
+	xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen,
+		&rbno, &rlen);
+	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+	XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+			(rlen <= flen && rbno + rlen <= fbno + flen), error0);
+	if (rlen < args->maxlen) {
+		xfs_agblock_t	bestfbno;
+		xfs_extlen_t	bestflen;
+		xfs_agblock_t	bestrbno;
+		xfs_extlen_t	bestrlen;
+
+		bestrlen = rlen;
+		bestrbno = rbno;
+		bestflen = flen;
+		bestfbno = fbno;
+		for (;;) {
+			if ((error = xfs_alloc_decrement(cnt_cur, 0, &i)))
+				goto error0;
+			if (i == 0)
+				break;
+			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
+					&i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if (flen < bestrlen)
+				break;
+			xfs_alloc_compute_aligned(fbno, flen, args->alignment,
+				args->minlen, &rbno, &rlen);
+			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
+			XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
+				(rlen <= flen && rbno + rlen <= fbno + flen),
+				error0);
+			if (rlen > bestrlen) {
+				bestrlen = rlen;
+				bestrbno = rbno;
+				bestflen = flen;
+				bestfbno = fbno;
+				if (rlen == args->maxlen)
+					break;
+			}
+		}
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
+				&i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		rlen = bestrlen;
+		rbno = bestrbno;
+		flen = bestflen;
+		fbno = bestfbno;
+	}
+	args->wasfromfl = 0;
+	/*
+	 * Fix up the length.
+	 */
+	args->len = rlen;
+	xfs_alloc_fix_len(args);
+	if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+		TRACE_ALLOC("nominleft", args);
+		args->agbno = NULLAGBLOCK;
+		return 0;
+	}
+	rlen = args->len;
+	XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
+	/*
+	 * Allocate and initialize a cursor for the by-block tree.
+	 */
+	bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
+		args->agno, XFS_BTNUM_BNO, 0, 0);
+	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
+			rbno, rlen, XFSA_FIXUP_CNT_OK)))
+		goto error0;
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	cnt_cur = bno_cur = NULL;
+	args->len = rlen;
+	args->agbno = rbno;
+	XFS_WANT_CORRUPTED_GOTO(
+		args->agbno + args->len <=
+			INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+			ARCH_CONVERT),
+		error0);
+	TRACE_ALLOC("normal", args);
+	return 0;
+
+error0:
+	TRACE_ALLOC("error", args);
+	if (cnt_cur)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	if (bno_cur)
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Deal with the case where only small freespaces remain.
+ * Either return the contents of the last freespace record,
+ * or allocate space from the freelist if there is nothing in the tree.
+ */
+STATIC int			/* error */
+xfs_alloc_ag_vextent_small(
+	xfs_alloc_arg_t *args,	/* allocation argument structure */
+	xfs_btree_cur_t *ccur,	/* by-size cursor */
+	xfs_agblock_t	*fbnop, /* result block number */
+	xfs_extlen_t	*flenp, /* result length */
+	int		*stat)	/* status: 0-freelist, 1-normal/none */
+{
+	int		error;
+	xfs_agblock_t	fbno;
+	xfs_extlen_t	flen;
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_ag_vextent_small";
+#endif
+	int		i;
+
+	if ((error = xfs_alloc_decrement(ccur, 0, &i)))
+		goto error0;
+	if (i) {
+		if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	}
+	/*
+	 * Nothing in the btree, try the freelist.  Make sure
+	 * to respect minleft even when pulling from the
+	 * freelist.
+	 */
+	else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
+		 (INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_flcount,
+			ARCH_CONVERT) > args->minleft)) {
+		if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno)))
+			goto error0;
+		if (fbno != NULLAGBLOCK) {
+			if (args->userdata) {
+				xfs_buf_t	*bp;
+
+				bp = xfs_btree_get_bufs(args->mp, args->tp,
+					args->agno, fbno, 0);
+				xfs_trans_binval(args->tp, bp);
+			}
+			args->len = 1;
+			args->agbno = fbno;
+			XFS_WANT_CORRUPTED_GOTO(
+				args->agbno + args->len <=
+				INT_GET(XFS_BUF_TO_AGF(args->agbp)->agf_length,
+					ARCH_CONVERT),
+				error0);
+			args->wasfromfl = 1;
+			TRACE_ALLOC("freelist", args);
+			*stat = 0;
+			return 0;
+		}
+		/*
+		 * Nothing in the freelist.
+		 */
+		else
+			flen = 0;
+	}
+	/*
+	 * Can't allocate from the freelist for some reason.
+	 */
+	else
+		flen = 0;
+	/*
+	 * Can't do the allocation, give up.
+	 */
+	if (flen < args->minlen) {
+		args->agbno = NULLAGBLOCK;
+		TRACE_ALLOC("notenough", args);
+		flen = 0;
+	}
+	*fbnop = fbno;
+	*flenp = flen;
+	*stat = 1;
+	TRACE_ALLOC("normal", args);
+	return 0;
+
+error0:
+	TRACE_ALLOC("error", args);
+	return error;
+}
+
+/*
+ * Free the extent starting at agno/bno for length.
+ */
+STATIC int			/* error */
+xfs_free_ag_extent(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*agbp,	/* buffer for a.g. freelist header */
+	xfs_agnumber_t	agno,	/* allocation group number */
+	xfs_agblock_t	bno,	/* starting block number */
+	xfs_extlen_t	len,	/* length of extent */
+	int		isfl)	/* set if is freelist blocks - no sb acctg */
+{
+	xfs_btree_cur_t *bno_cur;	/* cursor for by-block btree */
+	xfs_btree_cur_t *cnt_cur;	/* cursor for by-size btree */
+	int		error;		/* error return value */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_free_ag_extent";
+#endif
+	xfs_agblock_t	gtbno;		/* start of right neighbor block */
+	xfs_extlen_t	gtlen;		/* length of right neighbor block */
+	int		haveleft;	/* have a left neighbor block */
+	int		haveright;	/* have a right neighbor block */
+	int		i;		/* temp, result code */
+	xfs_agblock_t	ltbno;		/* start of left neighbor block */
+	xfs_extlen_t	ltlen;		/* length of left neighbor block */
+	xfs_mount_t	*mp;		/* mount point struct for filesystem */
+	xfs_agblock_t	nbno;		/* new starting block of freespace */
+	xfs_extlen_t	nlen;		/* new length of freespace */
+
+	mp = tp->t_mountp;
+	/*
+	 * Allocate and initialize a cursor for the by-block btree.
+	 */
+	bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, 0,
+		0);
+	cnt_cur = NULL;
+	/*
+	 * Look for a neighboring block on the left (lower block numbers)
+	 * that is contiguous with this space.
+	 */
+	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
+		goto error0;
+	if (haveleft) {
+		/*
+		 * There is a block to our left.
+		 */
+		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * It's not contiguous, though.
+		 */
+		if (ltbno + ltlen < bno)
+			haveleft = 0;
+		else {
+			/*
+			 * If this failure happens the request to free this
+			 * space was invalid, it's (partly) already free.
+			 * Very bad.
+			 */
+			XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0);
+		}
+	}
+	/*
+	 * Look for a neighboring block on the right (higher block numbers)
+	 * that is contiguous with this space.
+	 */
+	if ((error = xfs_alloc_increment(bno_cur, 0, &haveright)))
+		goto error0;
+	if (haveright) {
+		/*
+		 * There is a block to our right.
+		 */
+		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * It's not contiguous, though.
+		 */
+		if (bno + len < gtbno)
+			haveright = 0;
+		else {
+			/*
+			 * If this failure happens the request to free this
+			 * space was invalid, it's (partly) already free.
+			 * Very bad.
+			 */
+			XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0);
+		}
+	}
+	/*
+	 * Now allocate and initialize a cursor for the by-size tree.
+	 */
+	cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, 0,
+		0);
+	/*
+	 * Have both left and right contiguous neighbors.
+	 * Merge all three into a single free block.
+	 */
+	if (haveleft && haveright) {
+		/*
+		 * Delete the old by-size entry on the left.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Delete the old by-size entry on the right.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Delete the old by-block entry for the right block.
+		 */
+		if ((error = xfs_alloc_delete(bno_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Move the by-block cursor back to the left neighbor.
+		 */
+		if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+#ifdef DEBUG
+		/*
+		 * Check that this is the right record: delete didn't
+		 * mangle the cursor.
+		 */
+		{
+			xfs_agblock_t	xxbno;
+			xfs_extlen_t	xxlen;
+
+			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
+					&i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(
+				i == 1 && xxbno == ltbno && xxlen == ltlen,
+				error0);
+		}
+#endif
+		/*
+		 * Update remaining by-block entry to the new, joined block.
+		 */
+		nbno = ltbno;
+		nlen = len + ltlen + gtlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * Have only a left contiguous neighbor.
+	 * Merge it together with the new freespace.
+	 */
+	else if (haveleft) {
+		/*
+		 * Delete the old by-size entry on the left.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Back up the by-block cursor to the left neighbor, and
+		 * update its length.
+		 */
+		if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		nbno = ltbno;
+		nlen = len + ltlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * Have only a right contiguous neighbor.
+	 * Merge it together with the new freespace.
+	 */
+	else if (haveright) {
+		/*
+		 * Delete the old by-size entry on the right.
+		 */
+		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_delete(cnt_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Update the starting block and length of the right
+		 * neighbor in the by-block tree.
+		 */
+		nbno = bno;
+		nlen = len + gtlen;
+		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
+			goto error0;
+	}
+	/*
+	 * No contiguous neighbors.
+	 * Insert the new freespace into the by-block tree.
+	 */
+	else {
+		nbno = bno;
+		nlen = len;
+		if ((error = xfs_alloc_insert(bno_cur, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	}
+	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
+	bno_cur = NULL;
+	/*
+	 * In all cases we need to insert the new freespace in the by-size tree.
+	 */
+	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
+	if ((error = xfs_alloc_insert(cnt_cur, &i)))
+		goto error0;
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	cnt_cur = NULL;
+	/*
+	 * Update the freespace totals in the ag and superblock.
+	 */
+	{
+		xfs_agf_t	*agf;
+		xfs_perag_t	*pag;		/* per allocation group data */
+
+		agf = XFS_BUF_TO_AGF(agbp);
+		pag = &mp->m_perag[agno];
+		INT_MOD(agf->agf_freeblks, ARCH_CONVERT, len);
+		xfs_trans_agblocks_delta(tp, len);
+		pag->pagf_freeblks += len;
+		XFS_WANT_CORRUPTED_GOTO(
+			INT_GET(agf->agf_freeblks, ARCH_CONVERT)
+				<= INT_GET(agf->agf_length, ARCH_CONVERT),
+			error0);
+		TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS);
+		xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+		if (!isfl)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+		XFS_STATS_INC(xfsstats.xs_freex);
+		XFS_STATS_ADD(xfsstats.xs_freeb, len);
+	}
+	TRACE_FREE(haveleft ?
+			(haveright ? "both" : "left") :
+			(haveright ? "right" : "none"),
+		agno, bno, len, isfl);
+
+	/*
+	 * Since blocks move to the free list without the coordination
+	 * used in xfs_bmap_finish, we can't allow block to be available
+	 * for reallocation and non-transaction writing (user data)
+	 * until we know that the transaction that moved it to the free
+	 * list is permanently on disk.	 We track the blocks by declaring
+	 * these blocks as "busy"; the busy list is maintained on a per-ag
+	 * basis and each transaction records which entries should be removed
+	 * when the iclog commits to disk.  If a busy block is allocated,
+	 * the iclog is pushed up to the LSN that freed the block.
+	 */
+	xfs_alloc_mark_busy(tp, agno, bno, len);
+	return 0;
+
+ error0:
+	TRACE_FREE("error", agno, bno, len, isfl);
+	if (bno_cur)
+		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
+	if (cnt_cur)
+		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Visible (exported) allocation/free functions.
+ * Some of these are used just by xfs_alloc_btree.c and this file.
+ */
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+	xfs_mount_t	*mp)	/* file system mount structure */
+{
+	int		level;
+	uint		maxblocks;
+	uint		maxleafents;
+	int		minleafrecs;
+	int		minnoderecs;
+
+	maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
+	minleafrecs = mp->m_alloc_mnr[0];
+	minnoderecs = mp->m_alloc_mnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	mp->m_ag_maxlevels = level;
+}
+
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ * This is external so mkfs can call it, too.
+ */
+int				/* error */
+xfs_alloc_fix_freelist(
+	xfs_alloc_arg_t *args,	/* allocation argument structure */
+	int		flags)	/* XFS_ALLOC_FLAG_... */
+{
+	xfs_buf_t	*agbp;	/* agf buffer pointer */
+	xfs_agf_t	*agf;	/* a.g. freespace structure pointer */
+	xfs_buf_t	*agflbp;/* agfl buffer pointer */
+	xfs_agblock_t	bno;	/* freelist block */
+	xfs_extlen_t	delta;	/* new blocks needed in freelist */
+	int		error;	/* error result code */
+	xfs_extlen_t	longest;/* longest extent in allocation group */
+	xfs_mount_t	*mp;	/* file system mount point structure */
+	xfs_extlen_t	need;	/* total blocks needed in freelist */
+	xfs_perag_t	*pag;	/* per-ag information structure */
+	xfs_alloc_arg_t targs;	/* local allocation arguments */
+	xfs_trans_t	*tp;	/* transaction pointer */
+
+	mp = args->mp;
+
+	pag = args->pag;
+	tp = args->tp;
+	if (!pag->pagf_init) {
+		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+				&agbp)))
+			return error;
+		if (!pag->pagf_init) {
+			args->agbp = NULL;
+			return 0;
+		}
+	} else
+		agbp = NULL;
+
+	/* If this is a metadata prefered pag and we are user data
+	 * then try somewhere else if we are not being asked to
+	 * try harder at this point
+	 */
+	if (pag->pagf_metadata && args->userdata && flags) {
+		args->agbp = NULL;
+		return 0;
+	}
+
+	need = XFS_MIN_FREELIST_PAG(pag, mp);
+	delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0;
+	/*
+	 * If it looks like there isn't a long enough extent, or enough
+	 * total blocks, reject it.
+	 */
+	longest = (pag->pagf_longest > delta) ?
+		(pag->pagf_longest - delta) :
+		(pag->pagf_flcount > 0 || pag->pagf_longest > 0);
+	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
+	    (args->minleft &&
+	     (int)(pag->pagf_freeblks + pag->pagf_flcount -
+		   need - args->total) <
+	     (int)args->minleft)) {
+		if (agbp)
+			xfs_trans_brelse(tp, agbp);
+		args->agbp = NULL;
+		return 0;
+	}
+	/*
+	 * Get the a.g. freespace buffer.
+	 * Can fail if we're not blocking on locks, and it's held.
+	 */
+	if (agbp == NULL) {
+		if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags,
+				&agbp)))
+			return error;
+		if (agbp == NULL) {
+			args->agbp = NULL;
+			return 0;
+		}
+	}
+	/*
+	 * Figure out how many blocks we should have in the freelist.
+	 */
+	agf = XFS_BUF_TO_AGF(agbp);
+	need = XFS_MIN_FREELIST(agf, mp);
+	delta = need > INT_GET(agf->agf_flcount, ARCH_CONVERT) ?
+		(need - INT_GET(agf->agf_flcount, ARCH_CONVERT)) : 0;
+	/*
+	 * If there isn't enough total or single-extent, reject it.
+	 */
+	longest = INT_GET(agf->agf_longest, ARCH_CONVERT);
+	longest = (longest > delta) ? (longest - delta) :
+		(INT_GET(agf->agf_flcount, ARCH_CONVERT) > 0 || longest > 0);
+	if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
+	     (args->minleft &&
+		(int)(INT_GET(agf->agf_freeblks, ARCH_CONVERT) +
+		   INT_GET(agf->agf_flcount, ARCH_CONVERT) - need - args->total) <
+	     (int)args->minleft)) {
+		xfs_trans_brelse(tp, agbp);
+		args->agbp = NULL;
+		return 0;
+	}
+	/*
+	 * Make the freelist shorter if it's too long.
+	 */
+	while (INT_GET(agf->agf_flcount, ARCH_CONVERT) > need) {
+		xfs_buf_t	*bp;
+
+		if ((error = xfs_alloc_get_freelist(tp, agbp, &bno)))
+			return error;
+		if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
+			return error;
+		bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
+		xfs_trans_binval(tp, bp);
+	}
+	/*
+	 * Initialize the args structure.
+	 */
+	targs.tp = tp;
+	targs.mp = mp;
+	targs.agbp = agbp;
+	targs.agno = args->agno;
+	targs.mod = targs.minleft = targs.wasdel = targs.userdata =
+		targs.minalignslop = 0;
+	targs.alignment = targs.minlen = targs.prod = targs.isfl = 1;
+	targs.type = XFS_ALLOCTYPE_THIS_AG;
+	targs.pag = pag;
+	if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp)))
+		return error;
+	/*
+	 * Make the freelist longer if it's too short.
+	 */
+	while (INT_GET(agf->agf_flcount, ARCH_CONVERT) < need) {
+		targs.agbno = 0;
+		targs.maxlen = need - INT_GET(agf->agf_flcount, ARCH_CONVERT);
+		/*
+		 * Allocate as many blocks as possible at once.
+		 */
+		if ((error = xfs_alloc_ag_vextent(&targs)))
+			return error;
+		/*
+		 * Stop if we run out.	Won't happen if callers are obeying
+		 * the restrictions correctly.	Can happen for free calls
+		 * on a completely full ag.
+		 */
+		if (targs.agbno == NULLAGBLOCK)
+			break;
+		/*
+		 * Put each allocated block on the list.
+		 */
+		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
+			if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp,
+					bno)))
+				return error;
+		}
+	}
+	args->agbp = agbp;
+	return 0;
+}
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int				/* error */
+xfs_alloc_get_freelist(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*agbp,	/* buffer containing the agf structure */
+	xfs_agblock_t	*bnop)	/* block address retrieved from freelist */
+{
+	xfs_agf_t	*agf;	/* a.g. freespace structure */
+	xfs_agfl_t	*agfl;	/* a.g. freelist structure */
+	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
+	xfs_agblock_t	bno;	/* block number returned */
+	int		error;
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_get_freelist";
+#endif
+	xfs_mount_t	*mp;	/* mount structure */
+	xfs_perag_t	*pag;	/* per allocation group data */
+
+	agf = XFS_BUF_TO_AGF(agbp);
+	/*
+	 * Freelist is empty, give up.
+	 */
+	if (INT_ISZERO(agf->agf_flcount, ARCH_CONVERT)) {
+		*bnop = NULLAGBLOCK;
+		return 0;
+	}
+	/*
+	 * Read the array of free blocks.
+	 */
+	mp = tp->t_mountp;
+	if ((error = xfs_alloc_read_agfl(mp, tp,
+			INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp)))
+		return error;
+	agfl = XFS_BUF_TO_AGFL(agflbp);
+	/*
+	 * Get the block number and update the data structures.
+	 */
+	bno = INT_GET(agfl->agfl_bno[INT_GET(agf->agf_flfirst, ARCH_CONVERT)], ARCH_CONVERT);
+	INT_MOD(agf->agf_flfirst, ARCH_CONVERT, 1);
+	xfs_trans_brelse(tp, agflbp);
+	if (INT_GET(agf->agf_flfirst, ARCH_CONVERT) == XFS_AGFL_SIZE)
+		INT_ZERO(agf->agf_flfirst, ARCH_CONVERT);
+	pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)];
+	INT_MOD(agf->agf_flcount, ARCH_CONVERT, -1);
+	xfs_trans_agflist_delta(tp, -1);
+	pag->pagf_flcount--;
+	TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
+	*bnop = bno;
+
+	/*
+	 * As blocks are freed, they are added to the per-ag busy list
+	 * and remain there until the freeing transaction is committed to
+	 * disk.  Now that we have allocated blocks, this list must be
+	 * searched to see if a block is being reused.	If one is, then
+	 * the freeing transaction must be pushed to disk NOW by forcing
+	 * to disk all iclogs up that transaction's LSN.
+	 */
+	xfs_alloc_search_busy(tp, INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
+	return 0;
+}
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_buf_t	*bp,	/* buffer for a.g. freelist header */
+	int		fields) /* mask of fields to be logged (XFS_AGF_...) */
+{
+	int	first;		/* first byte offset */
+	int	last;		/* last byte offset */
+	static const short	offsets[] = {
+		offsetof(xfs_agf_t, agf_magicnum),
+		offsetof(xfs_agf_t, agf_versionnum),
+		offsetof(xfs_agf_t, agf_seqno),
+		offsetof(xfs_agf_t, agf_length),
+		offsetof(xfs_agf_t, agf_roots[0]),
+		offsetof(xfs_agf_t, agf_levels[0]),
+		offsetof(xfs_agf_t, agf_flfirst),
+		offsetof(xfs_agf_t, agf_fllast),
+		offsetof(xfs_agf_t, agf_flcount),
+		offsetof(xfs_agf_t, agf_freeblks),
+		offsetof(xfs_agf_t, agf_longest),
+		sizeof(xfs_agf_t)
+	};
+
+	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
+	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
+}
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int					/* error */
+xfs_alloc_pagf_init(
+	xfs_mount_t		*mp,	/* file system mount structure */
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	int			flags)	/* XFS_ALLOC_FLAGS_... */
+{
+	xfs_buf_t		*bp;
+	int			error;
+
+	if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
+		return error;
+	if (bp)
+		xfs_trans_brelse(tp, bp);
+	return 0;
+}
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int					/* error */
+xfs_alloc_put_freelist(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_buf_t		*agbp,	/* buffer for a.g. freelist header */
+	xfs_buf_t		*agflbp,/* buffer for a.g. free block array */
+	xfs_agblock_t		bno)	/* block being freed */
+{
+	xfs_agf_t		*agf;	/* a.g. freespace structure */
+	xfs_agfl_t		*agfl;	/* a.g. free block array */
+	xfs_agblock_t		*blockp;/* pointer to array entry */
+	int			error;
+#ifdef XFS_ALLOC_TRACE
+	static char		fname[] = "xfs_alloc_put_freelist";
+#endif
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_perag_t		*pag;	/* per allocation group data */
+
+	agf = XFS_BUF_TO_AGF(agbp);
+	mp = tp->t_mountp;
+
+	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
+			INT_GET(agf->agf_seqno, ARCH_CONVERT), &agflbp)))
+		return error;
+	agfl = XFS_BUF_TO_AGFL(agflbp);
+	INT_MOD(agf->agf_fllast, ARCH_CONVERT, 1);
+	if (INT_GET(agf->agf_fllast, ARCH_CONVERT) == XFS_AGFL_SIZE)
+		INT_ZERO(agf->agf_fllast, ARCH_CONVERT);
+	pag = &mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)];
+	INT_MOD(agf->agf_flcount, ARCH_CONVERT, 1);
+	xfs_trans_agflist_delta(tp, 1);
+	pag->pagf_flcount++;
+	ASSERT(INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE);
+	blockp = &agfl->agfl_bno[INT_GET(agf->agf_fllast, ARCH_CONVERT)];
+	INT_SET(*blockp, ARCH_CONVERT, bno);
+	TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
+	xfs_trans_log_buf(tp, agflbp,
+		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
+		(int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
+			sizeof(xfs_agblock_t) - 1));
+	/*
+	 * Since blocks move to the free list without the coordination
+	 * used in xfs_bmap_finish, we can't allow block to be available
+	 * for reallocation and non-transaction writing (user data)
+	 * until we know that the transaction that moved it to the free
+	 * list is permanently on disk.	 We track the blocks by declaring
+	 * these blocks as "busy"; the busy list is maintained on a per-ag
+	 * basis and each transaction records which entries should be removed
+	 * when the iclog commits to disk.  If a busy block is allocated,
+	 * the iclog is pushed up to the LSN that freed the block.
+	 */
+	xfs_alloc_mark_busy(tp, INT_GET(agf->agf_seqno, ARCH_CONVERT), bno, 1);
+
+	return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error */
+xfs_alloc_read_agf(
+	xfs_mount_t	*mp,		/* mount point structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	int		flags,		/* XFS_ALLOC_FLAG_... */
+	xfs_buf_t	**bpp)		/* buffer for the ag freelist header */
+{
+	xfs_agf_t	*agf;		/* ag freelist header */
+	int		agf_ok;		/* set if agf is consistent */
+	xfs_buf_t	*bp;		/* return value */
+	xfs_daddr_t	d;		/* disk block address */
+	int		error;
+	xfs_perag_t	*pag;		/* per allocation group data */
+
+	ASSERT(agno != NULLAGNUMBER);
+	d = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 1,
+			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
+			&bp)))
+		return error;
+	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
+	if (!bp) {
+		*bpp = NULL;
+		return 0;
+	}
+	/*
+	 * Validate the magic number of the agf block.
+	 */
+	agf = XFS_BUF_TO_AGF(bp);
+	agf_ok =
+		INT_GET(agf->agf_magicnum, ARCH_CONVERT) == XFS_AGF_MAGIC &&
+		XFS_AGF_GOOD_VERSION(INT_GET(agf->agf_versionnum, ARCH_CONVERT)) &&
+		INT_GET(agf->agf_freeblks, ARCH_CONVERT) <=
+				INT_GET(agf->agf_length, ARCH_CONVERT) &&
+		INT_GET(agf->agf_flfirst, ARCH_CONVERT) < XFS_AGFL_SIZE &&
+		INT_GET(agf->agf_fllast,  ARCH_CONVERT) < XFS_AGFL_SIZE &&
+		INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE;
+	if (XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+			XFS_RANDOM_ALLOC_READ_AGF)) {
+		xfs_trans_brelse(tp, bp);
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"xfs_alloc_read_agf: error in <%s> AG %d",
+			mp->m_fsname, agno);
+		if (INT_GET(agf->agf_magicnum, ARCH_CONVERT) != XFS_AGF_MAGIC)
+			cmn_err(CE_NOTE, "bad agf_magicnum 0x%x",
+				INT_GET(agf->agf_magicnum, ARCH_CONVERT));
+		if (!XFS_AGF_GOOD_VERSION(INT_GET(agf->agf_versionnum, ARCH_CONVERT)))
+			cmn_err(CE_NOTE, "Bad version number 0x%x",
+				INT_GET(agf->agf_versionnum, ARCH_CONVERT));
+		if (!(INT_GET(agf->agf_freeblks, ARCH_CONVERT) <=
+				INT_GET(agf->agf_length, ARCH_CONVERT)))
+			cmn_err(CE_NOTE, "Bad freeblks %d %d",
+				INT_GET(agf->agf_freeblks, ARCH_CONVERT),
+				INT_GET(agf->agf_length, ARCH_CONVERT));
+		if (!(INT_GET(agf->agf_flfirst, ARCH_CONVERT) < XFS_AGFL_SIZE))
+			cmn_err(CE_NOTE, "Bad flfirst %d",
+				INT_GET(agf->agf_flfirst, ARCH_CONVERT));
+		if (!(INT_GET(agf->agf_fllast, ARCH_CONVERT) < XFS_AGFL_SIZE))
+			cmn_err(CE_NOTE, "Bad fllast %d",
+				INT_GET(agf->agf_fllast, ARCH_CONVERT));
+		if (!(INT_GET(agf->agf_flcount, ARCH_CONVERT) <= XFS_AGFL_SIZE))
+			cmn_err(CE_NOTE, "Bad flcount %d",
+				INT_GET(agf->agf_flcount, ARCH_CONVERT));
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	pag = &mp->m_perag[agno];
+	if (!pag->pagf_init) {
+		pag->pagf_freeblks = INT_GET(agf->agf_freeblks, ARCH_CONVERT);
+		pag->pagf_flcount = INT_GET(agf->agf_flcount, ARCH_CONVERT);
+		pag->pagf_longest = INT_GET(agf->agf_longest, ARCH_CONVERT);
+		pag->pagf_levels[XFS_BTNUM_BNOi] =
+			INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT);
+		pag->pagf_levels[XFS_BTNUM_CNTi] =
+			INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT);
+		spinlock_init(&pag->pagb_lock, "xfspagb");
+		pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
+					sizeof(xfs_perag_busy_t), KM_SLEEP);
+		pag->pagf_init = 1;
+	}
+#ifdef DEBUG
+	else if (!XFS_FORCED_SHUTDOWN(mp)) {
+		ASSERT(pag->pagf_freeblks == INT_GET(agf->agf_freeblks, ARCH_CONVERT));
+		ASSERT(pag->pagf_flcount == INT_GET(agf->agf_flcount, ARCH_CONVERT));
+		ASSERT(pag->pagf_longest == INT_GET(agf->agf_longest, ARCH_CONVERT));
+		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
+		       INT_GET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT));
+		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
+		       INT_GET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT));
+	}
+#endif
+	XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Allocate an extent (variable-size).
+ * Depending on the allocation type, we either look in a single allocation
+ * group or loop over the allocation groups to find the result.
+ */
+int				/* error */
+xfs_alloc_vextent(
+	xfs_alloc_arg_t *args)	/* allocation argument structure */
+{
+	xfs_agblock_t	agsize; /* allocation group size */
+	int		error;
+	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
+#ifdef XFS_ALLOC_TRACE
+	static char	fname[] = "xfs_alloc_vextent";
+#endif
+	xfs_extlen_t	minleft;/* minimum left value, temp copy */
+	xfs_mount_t	*mp;	/* mount structure pointer */
+	xfs_agnumber_t	sagno;	/* starting allocation group number */
+	xfs_alloctype_t type;	/* input allocation type */
+	int		bump_rotor = 0;
+	int		no_min = 0;
+
+	mp = args->mp;
+	type = args->otype = args->type;
+	args->agbno = NULLAGBLOCK;
+	/*
+	 * Just fix this up, for the case where the last a.g. is shorter
+	 * (or there's only one a.g.) and the caller couldn't easily figure
+	 * that out (xfs_bmap_alloc).
+	 */
+	agsize = mp->m_sb.sb_agblocks;
+	if (args->maxlen > agsize)
+		args->maxlen = agsize;
+	if (args->alignment == 0)
+		args->alignment = 1;
+	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
+	ASSERT(args->minlen <= args->maxlen);
+	ASSERT(args->minlen <= agsize);
+	ASSERT(args->mod < args->prod);
+	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
+	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
+	    args->minlen > args->maxlen || args->minlen > agsize ||
+	    args->mod >= args->prod) {
+		args->fsbno = NULLFSBLOCK;
+		TRACE_ALLOC("badargs", args);
+		return 0;
+	}
+	minleft = args->minleft;
+
+	switch (type) {
+	case XFS_ALLOCTYPE_THIS_AG:
+	case XFS_ALLOCTYPE_NEAR_BNO:
+	case XFS_ALLOCTYPE_THIS_BNO:
+		/*
+		 * These three force us into a single a.g.
+		 */
+		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+		down_read(&mp->m_peraglock);
+		args->pag = &mp->m_perag[args->agno];
+		args->minleft = 0;
+		error = xfs_alloc_fix_freelist(args, 0);
+		args->minleft = minleft;
+		if (error) {
+			TRACE_ALLOC("nofix", args);
+			goto error0;
+		}
+		if (!args->agbp) {
+			up_read(&mp->m_peraglock);
+			TRACE_ALLOC("noagbp", args);
+			break;
+		}
+		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+		if ((error = xfs_alloc_ag_vextent(args)))
+			goto error0;
+		up_read(&mp->m_peraglock);
+		break;
+	case XFS_ALLOCTYPE_START_BNO:
+		/*
+		 * Try near allocation first, then anywhere-in-ag after
+		 * the first a.g. fails.
+		 */
+		if ((args->userdata  == XFS_ALLOC_INITIAL_USER_DATA) &&
+		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
+			args->fsbno = XFS_AGB_TO_FSB(mp, mp->m_agfrotor, 0);
+			bump_rotor = 1;
+		}
+		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+		args->type = XFS_ALLOCTYPE_NEAR_BNO;
+		/* FALLTHROUGH */
+	case XFS_ALLOCTYPE_ANY_AG:
+	case XFS_ALLOCTYPE_START_AG:
+	case XFS_ALLOCTYPE_FIRST_AG:
+		/*
+		 * Rotate through the allocation groups looking for a winner.
+		 */
+		if (type == XFS_ALLOCTYPE_ANY_AG) {
+			/*
+			 * Start with the last place we left off.
+			 */
+			args->agno = sagno = mp->m_agfrotor;
+			args->type = XFS_ALLOCTYPE_THIS_AG;
+			flags = XFS_ALLOC_FLAG_TRYLOCK;
+		} else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+			/*
+			 * Start with allocation group given by bno.
+			 */
+			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+			args->type = XFS_ALLOCTYPE_THIS_AG;
+			sagno = 0;
+			flags = 0;
+		} else {
+			if (type == XFS_ALLOCTYPE_START_AG)
+				args->type = XFS_ALLOCTYPE_THIS_AG;
+			/*
+			 * Start with the given allocation group.
+			 */
+			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+			flags = XFS_ALLOC_FLAG_TRYLOCK;
+		}
+		/*
+		 * Loop over allocation groups twice; first time with
+		 * trylock set, second time without.
+		 */
+		down_read(&mp->m_peraglock);
+		for (;;) {
+			args->pag = &mp->m_perag[args->agno];
+			if (no_min) args->minleft = 0;
+			error = xfs_alloc_fix_freelist(args, flags);
+			args->minleft = minleft;
+			if (error) {
+				TRACE_ALLOC("nofix", args);
+				goto error0;
+			}
+			/*
+			 * If we get a buffer back then the allocation will fly.
+			 */
+			if (args->agbp) {
+				if ((error = xfs_alloc_ag_vextent(args)))
+					goto error0;
+				break;
+			}
+			TRACE_ALLOC("loopfailed", args);
+			/*
+			 * Didn't work, figure out the next iteration.
+			 */
+			if (args->agno == sagno &&
+			    type == XFS_ALLOCTYPE_START_BNO)
+				args->type = XFS_ALLOCTYPE_THIS_AG;
+			if (++(args->agno) == mp->m_sb.sb_agcount)
+				args->agno = 0;
+			/*
+			 * Reached the starting a.g., must either be done
+			 * or switch to non-trylock mode.
+			 */
+			if (args->agno == sagno) {
+				if (no_min == 1) {
+					args->agbno = NULLAGBLOCK;
+					TRACE_ALLOC("allfailed", args);
+					break;
+				}
+				if (flags == 0) {
+					no_min = 1;
+				} else {
+					flags = 0;
+					if (type == XFS_ALLOCTYPE_START_BNO) {
+						args->agbno = XFS_FSB_TO_AGBNO(mp,
+							args->fsbno);
+						args->type = XFS_ALLOCTYPE_NEAR_BNO;
+					}
+				}
+			}
+		}
+		up_read(&mp->m_peraglock);
+		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG))
+			mp->m_agfrotor = (args->agno + 1) % mp->m_sb.sb_agcount;
+		break;
+	default:
+		ASSERT(0);
+		/* NOTREACHED */
+	}
+	if (args->agbno == NULLAGBLOCK)
+		args->fsbno = NULLFSBLOCK;
+	else {
+		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+#ifdef DEBUG
+		ASSERT(args->len >= args->minlen);
+		ASSERT(args->len <= args->maxlen);
+		ASSERT(args->agbno % args->alignment == 0);
+		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
+			args->len);
+#endif
+	}
+	return 0;
+error0:
+	up_read(&mp->m_peraglock);
+	return error;
+}
+
+/*
+ * Free an extent.
+ * Just break up the extent address and hand off to xfs_free_ag_extent
+ * after fixing up the freelist.
+ */
+int				/* error */
+xfs_free_extent(
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_fsblock_t	bno,	/* starting block number of extent */
+	xfs_extlen_t	len)	/* length of extent */
+{
+#ifdef DEBUG
+	xfs_agf_t	*agf;	/* a.g. freespace header */
+#endif
+	xfs_alloc_arg_t args;	/* allocation argument structure */
+	int		error;
+
+	ASSERT(len != 0);
+	args.tp = tp;
+	args.mp = tp->t_mountp;
+	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
+	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
+	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+	args.alignment = 1;
+	args.minlen = args.minleft = args.minalignslop = 0;
+	down_read(&args.mp->m_peraglock);
+	args.pag = &args.mp->m_perag[args.agno];
+	if ((error = xfs_alloc_fix_freelist(&args, 0)))
+		goto error0;
+#ifdef DEBUG
+	ASSERT(args.agbp != NULL);
+	agf = XFS_BUF_TO_AGF(args.agbp);
+	ASSERT(args.agbno + len <= INT_GET(agf->agf_length, ARCH_CONVERT));
+#endif
+	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno,
+		len, 0);
+error0:
+	up_read(&args.mp->m_peraglock);
+	return error;
+}
+
+
+/*
+ * AG Busy list management
+ * The busy list contains block ranges that have been freed but whose
+ * transacations have not yet hit disk.	 If any block listed in a busy
+ * list is reused, the transaction that freed it must be forced to disk
+ * before continuing to use the block.
+ *
+ * xfs_alloc_mark_busy - add to the per-ag busy list
+ * xfs_alloc_clear_busy - remove an item from the per-ag busy list
+ */
+void
+xfs_alloc_mark_busy(xfs_trans_t *tp,
+		    xfs_agnumber_t agno,
+		    xfs_agblock_t bno,
+		    xfs_extlen_t len)
+{
+	xfs_mount_t		*mp;
+	xfs_perag_busy_t	*bsy;
+	int			n;
+	SPLDECL(s);
+
+	mp = tp->t_mountp;
+	s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
+
+	/* search pagb_list for an open slot */
+	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
+	     n < XFS_PAGB_NUM_SLOTS;
+	     bsy++, n++) {
+		if (bsy->busy_tp == NULL) {
+			break;
+		}
+	}
+
+	if (n < XFS_PAGB_NUM_SLOTS) {
+		bsy = &mp->m_perag[agno].pagb_list[n];
+		mp->m_perag[agno].pagb_count++;
+		TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp);
+		bsy->busy_start = bno;
+		bsy->busy_length = len;
+		bsy->busy_tp = tp;
+		xfs_trans_add_busy(tp, agno, n);
+	} else {
+		TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp);
+		/*
+		 * The busy list is full!  Since it is now not possible to
+		 * track the free block, make this a synchronous transaction
+		 * to insure that the block is not reused before this
+		 * transaction commits.
+		 */
+		xfs_trans_set_sync(tp);
+	}
+
+	mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+}
+
+void
+xfs_alloc_clear_busy(xfs_trans_t *tp,
+		     xfs_agnumber_t agno,
+		     int idx)
+{
+	xfs_mount_t		*mp;
+	xfs_perag_busy_t	*list;
+	SPLDECL(s);
+
+	mp = tp->t_mountp;
+
+	s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
+	list = mp->m_perag[agno].pagb_list;
+
+	ASSERT(idx < XFS_PAGB_NUM_SLOTS);
+	if (list[idx].busy_tp == tp) {
+		TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp);
+		list[idx].busy_tp = NULL;
+		mp->m_perag[agno].pagb_count--;
+	} else {
+		TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp);
+	}
+
+	mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+}
+
+
+/*
+ * returns non-zero if any of (agno,bno):len is in a busy list
+ */
+int
+xfs_alloc_search_busy(xfs_trans_t *tp,
+		    xfs_agnumber_t agno,
+		    xfs_agblock_t bno,
+		    xfs_extlen_t len)
+{
+	xfs_mount_t		*mp;
+	xfs_perag_busy_t	*bsy;
+	int			n;
+	xfs_agblock_t		uend, bend;
+	xfs_lsn_t		lsn;
+	int			cnt;
+	SPLDECL(s);
+
+	mp = tp->t_mountp;
+
+	s = mutex_spinlock(&mp->m_perag[agno].pagb_lock);
+	cnt = mp->m_perag[agno].pagb_count;
+
+	uend = bno + len;
+
+	/* search pagb_list for this slot, skipping open slots */
+	for (bsy = mp->m_perag[agno].pagb_list, n = 0;
+	     cnt; bsy++, n++) {
+
+		/*
+		 * (start1,length1) within (start2, length2)
+		 */
+		if (bsy->busy_tp != NULL) {
+			bend = bsy->busy_start + bsy->busy_length;
+			if ( (bno >= bsy->busy_start && bno <= bend) ||
+			     (uend >= bsy->busy_start && uend <= bend) ||
+			     (bno <= bsy->busy_start && uend >= bsy->busy_start) ) {
+				TRACE_BUSYSEARCH("xfs_alloc_search_busy",
+						 "found1", agno, bno, len, n,
+						 tp);
+				break;
+			}
+			cnt--;
+		}
+	}
+
+	/*
+	 * If a block was found, force the log through the LSN of the
+	 * transaction that freed the block
+	 */
+	if (cnt) {
+		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp);
+		lsn = bsy->busy_tp->t_lsn;
+		mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+		xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+	} else {
+		TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp);
+		n = -1;
+		mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s);
+	}
+
+	return n;
+}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
new file mode 100644
index 000000000000..2ef859cb41e1
--- /dev/null
+++ b/fs/xfs/xfs_alloc.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ALLOC_H__
+#define __XFS_ALLOC_H__
+
+struct xfs_buf;
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_trans;
+
+/*
+ * Freespace allocation types.	Argument to xfs_alloc_[v]extent.
+ */
+typedef enum xfs_alloctype
+{
+	XFS_ALLOCTYPE_ANY_AG,		/* allocate anywhere, use rotor */
+	XFS_ALLOCTYPE_FIRST_AG,		/* ... start at ag 0 */
+	XFS_ALLOCTYPE_START_AG,		/* anywhere, start in this a.g. */
+	XFS_ALLOCTYPE_THIS_AG,		/* anywhere in this a.g. */
+	XFS_ALLOCTYPE_START_BNO,	/* near this block else anywhere */
+	XFS_ALLOCTYPE_NEAR_BNO,		/* in this a.g. and near this block */
+	XFS_ALLOCTYPE_THIS_BNO		/* at exactly this block */
+} xfs_alloctype_t;
+
+/*
+ * Flags for xfs_alloc_fix_freelist.
+ */
+#define XFS_ALLOC_FLAG_TRYLOCK	0x00000001  /* use trylock for buffer locking */
+
+/*
+ * Argument structure for xfs_alloc routines.
+ * This is turned into a structure to avoid having 20 arguments passed
+ * down several levels of the stack.
+ */
+typedef struct xfs_alloc_arg {
+	struct xfs_trans *tp;		/* transaction pointer */
+	struct xfs_mount *mp;		/* file system mount point */
+	struct xfs_buf	*agbp;		/* buffer for a.g. freelist header */
+	struct xfs_perag *pag;		/* per-ag struct for this agno */
+	xfs_fsblock_t	fsbno;		/* file system block number */
+	xfs_agnumber_t	agno;		/* allocation group number */
+	xfs_agblock_t	agbno;		/* allocation group-relative block # */
+	xfs_extlen_t	minlen;		/* minimum size of extent */
+	xfs_extlen_t	maxlen;		/* maximum size of extent */
+	xfs_extlen_t	mod;		/* mod value for extent size */
+	xfs_extlen_t	prod;		/* prod value for extent size */
+	xfs_extlen_t	minleft;	/* min blocks must be left after us */
+	xfs_extlen_t	total;		/* total blocks needed in xaction */
+	xfs_extlen_t	alignment;	/* align answer to multiple of this */
+	xfs_extlen_t	minalignslop;	/* slop for minlen+alignment calcs */
+	xfs_extlen_t	len;		/* output: actual size of extent */
+	xfs_alloctype_t type;		/* allocation type XFS_ALLOCTYPE_... */
+	xfs_alloctype_t otype;		/* original allocation type */
+	char		wasdel;		/* set if allocation was prev delayed */
+	char		wasfromfl;	/* set if allocation is from freelist */
+	char		isfl;		/* set if is freelist blocks - !actg */
+	char		userdata;	/* set if this is user data */
+} xfs_alloc_arg_t;
+
+/*
+ * Defines for userdata
+ */
+#define XFS_ALLOC_USERDATA		1	/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA	2	/* special case start of file */
+
+
+#ifdef __KERNEL__
+
+/*
+ * Types for alloc tracing.
+ */
+#define XFS_ALLOC_KTRACE_ALLOC	1
+#define XFS_ALLOC_KTRACE_FREE	2
+#define XFS_ALLOC_KTRACE_MODAGF 3
+#define XFS_ALLOC_KTRACE_BUSY	4
+#define XFS_ALLOC_KTRACE_UNBUSY 5
+#define XFS_ALLOC_KTRACE_BUSYSEARCH	6
+
+
+/*
+ * Allocation tracing buffer size.
+ */
+#define XFS_ALLOC_TRACE_SIZE	4096
+
+#ifdef	XFS_ALL_TRACE
+#define XFS_ALLOC_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef	XFS_ALLOC_TRACE
+#endif
+
+/*
+ * Prototypes for visible xfs_alloc.c routines
+ */
+
+/*
+ * Compute and fill in value of m_ag_maxlevels.
+ */
+void
+xfs_alloc_compute_maxlevels(
+	struct xfs_mount	*mp);	/* file system mount structure */
+
+/*
+ * Decide whether to use this allocation group for this allocation.
+ * If so, fix up the btree freelist's size.
+ * This is external so mkfs can call it, too.
+ */
+int				/* error */
+xfs_alloc_fix_freelist(
+	xfs_alloc_arg_t *args,	/* allocation argument structure */
+	int		flags); /* XFS_ALLOC_FLAG_... */
+
+/*
+ * Get a block from the freelist.
+ * Returns with the buffer for the block gotten.
+ */
+int				/* error */
+xfs_alloc_get_freelist(
+	struct xfs_trans *tp,	/* transaction pointer */
+	struct xfs_buf	*agbp,	/* buffer containing the agf structure */
+	xfs_agblock_t	*bnop); /* block address retrieved from freelist */
+
+/*
+ * Log the given fields from the agf structure.
+ */
+void
+xfs_alloc_log_agf(
+	struct xfs_trans *tp,	/* transaction pointer */
+	struct xfs_buf	*bp,	/* buffer for a.g. freelist header */
+	int		fields);/* mask of fields to be logged (XFS_AGF_...) */
+
+/*
+ * Interface for inode allocation to force the pag data to be initialized.
+ */
+int				/* error */
+xfs_alloc_pagf_init(
+	struct xfs_mount *mp,	/* file system mount structure */
+	struct xfs_trans *tp,	/* transaction pointer */
+	xfs_agnumber_t	agno,	/* allocation group number */
+	int		flags); /* XFS_ALLOC_FLAGS_... */
+
+/*
+ * Put the block on the freelist for the allocation group.
+ */
+int				/* error */
+xfs_alloc_put_freelist(
+	struct xfs_trans *tp,	/* transaction pointer */
+	struct xfs_buf	*agbp,	/* buffer for a.g. freelist header */
+	struct xfs_buf	*agflbp,/* buffer for a.g. free block array */
+	xfs_agblock_t	bno);	/* block being freed */
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int					/* error  */
+xfs_alloc_read_agf(
+	struct xfs_mount *mp,		/* mount point structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	int		flags,		/* XFS_ALLOC_FLAG_... */
+	struct xfs_buf	**bpp);		/* buffer for the ag freelist header */
+
+/*
+ * Allocate an extent (variable-size).
+ */
+int				/* error */
+xfs_alloc_vextent(
+	xfs_alloc_arg_t *args); /* allocation argument structure */
+
+/*
+ * Free an extent.
+ */
+int				/* error */
+xfs_free_extent(
+	struct xfs_trans *tp,	/* transaction pointer */
+	xfs_fsblock_t	bno,	/* starting block number of extent */
+	xfs_extlen_t	len);	/* length of extent */
+
+void
+xfs_alloc_mark_busy(xfs_trans_t *tp,
+		xfs_agnumber_t agno,
+		xfs_agblock_t bno,
+		xfs_extlen_t len);
+
+void
+xfs_alloc_clear_busy(xfs_trans_t *tp,
+		xfs_agnumber_t ag,
+		int idx);
+
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
new file mode 100644
index 000000000000..11c7618be7c7
--- /dev/null
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -0,0 +1,2155 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Free space allocation for XFS.
+ */
+
+#include <xfs.h>
+
+/*
+ * Prototypes for internal functions.
+ */
+
+STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int);
+STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *);
+STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
+		xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
+STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
+
+/*
+ * Internal functions.
+ */
+
+/*
+ * Single level of the xfs_alloc_delete record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int				/* error */
+xfs_alloc_delrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level removing record from */
+	int			*stat)	/* fail/done/go-on */
+{
+	xfs_agf_t		*agf;	/* allocation group freelist header */
+	xfs_alloc_block_t	*block; /* btree block record/key lives in */
+	xfs_agblock_t		bno;	/* btree block number */
+	xfs_buf_t		*bp;	/* buffer for block */
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_alloc_key_t		key;	/* kp points here if block is level 0 */
+	xfs_agblock_t		lbno;	/* left block's block number */
+	xfs_buf_t		*lbp;	/* left block's buffer pointer */
+	xfs_alloc_block_t	*left;	/* left btree block */
+	xfs_alloc_key_t		*lkp=NULL;	/* left block key pointer */
+	xfs_alloc_ptr_t		*lpp=NULL;	/* left block address pointer */
+	int			lrecs=0;	/* number of records in left block */
+	xfs_alloc_rec_t		*lrp;	/* left block record pointer */
+	xfs_mount_t		*mp;	/* mount structure */
+	int			ptr;	/* index in btree block for this rec */
+	xfs_agblock_t		rbno;	/* right block's block number */
+	xfs_buf_t		*rbp;	/* right block's buffer pointer */
+	xfs_alloc_block_t	*right; /* right btree block */
+	xfs_alloc_key_t		*rkp;	/* right block key pointer */
+	xfs_alloc_ptr_t		*rpp;	/* right block address pointer */
+	int			rrecs=0;	/* number of records in right block */
+	xfs_alloc_rec_t		*rrp;	/* right block record pointer */
+	xfs_btree_cur_t		*tcur;	/* temporary btree cursor */
+
+	/*
+	 * Get the index of the entry being deleted, check for nothing there.
+	 */
+	ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Get the buffer & block containing the record or key/ptr.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+#endif
+	/*
+	 * Fail if we're off the end of the block.
+	 */
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 0;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_abt_delrec);
+	/*
+	 * It's a nonleaf.  Excise the key and ptr being deleted, by
+	 * sliding the entries past them down one.
+	 * Log the changed areas of the block.
+	 */
+	if (level > 0) {
+		lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
+		lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = ptr; i < INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			ovbcopy(&lkp[ptr], &lkp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lkp)); /* INT_: mem copy */
+			ovbcopy(&lpp[ptr], &lpp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lpp)); /* INT_: mem copy */
+			xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+			xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+		}
+	}
+	/*
+	 * It's a leaf.	 Excise the record being deleted, by sliding the
+	 * entries past it down one.  Log the changed areas of the block.
+	 */
+	else {
+		lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			ovbcopy(&lrp[ptr], &lrp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*lrp));
+			xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+		}
+		/*
+		 * If it's the first record in the block, we'll need a key
+		 * structure to pass up to the next level (updkey).
+		 */
+		if (ptr == 1) {
+			key.ar_startblock = lrp->ar_startblock; /* INT_: direct copy */
+			key.ar_blockcount = lrp->ar_blockcount; /* INT_: direct copy */
+			lkp = &key;
+		}
+	}
+	/*
+	 * Decrement and log the number of entries in the block.
+	 */
+	INT_MOD(block->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
+	/*
+	 * See if the longest free extent in the allocation group was
+	 * changed by this operation.  True if it's the by-size btree, and
+	 * this is the leaf level, and there is no right sibling block,
+	 * and this was the last record.
+	 */
+	agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	mp = cur->bc_mp;
+
+	if (level == 0 &&
+	    cur->bc_btnum == XFS_BTNUM_CNT &&
+	    INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
+	    ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		ASSERT(ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT) + 1);
+		/*
+		 * There are still records in the block.  Grab the size
+		 * from the last one.
+		 */
+		if (INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			rrp = XFS_ALLOC_REC_ADDR(block, INT_GET(block->bb_numrecs, ARCH_CONVERT), cur);
+			INT_COPY(agf->agf_longest, rrp->ar_blockcount, ARCH_CONVERT);
+		}
+		/*
+		 * No free extents left.
+		 */
+		else
+			INT_ZERO(agf->agf_longest, ARCH_CONVERT);
+		mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest =
+			INT_GET(agf->agf_longest, ARCH_CONVERT);
+		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+			XFS_AGF_LONGEST);
+	}
+	/*
+	 * Is this the root level?  If so, we're almost done.
+	 */
+	if (level == cur->bc_nlevels - 1) {
+		/*
+		 * If this is the root level,
+		 * and there's only one entry left,
+		 * and it's NOT the leaf level,
+		 * then we can get rid of this level.
+		 */
+		if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == 1 && level > 0) {
+			/*
+			 * lpp is still set to the first pointer in the block.
+			 * Make it the new root of the btree.
+			 */
+			bno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT);
+			INT_COPY(agf->agf_roots[cur->bc_btnum], *lpp, ARCH_CONVERT);
+			INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, -1);
+			mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_levels[cur->bc_btnum]--;
+			/*
+			 * Put this buffer/block on the ag's freelist.
+			 */
+			if ((error = xfs_alloc_put_freelist(cur->bc_tp,
+					cur->bc_private.a.agbp, NULL, bno)))
+				return error;
+			xfs_trans_agbtree_delta(cur->bc_tp, -1);
+			xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+				XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+			/*
+			 * Update the cursor so there's one fewer level.
+			 */
+			xfs_btree_setbuf(cur, level, 0);
+			cur->bc_nlevels--;
+		} else if (level > 0 &&
+			   (error = xfs_alloc_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * If we deleted the leftmost entry in the block, update the
+	 * key values above us in the tree.
+	 */
+	if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
+		return error;
+	/*
+	 * If the number of records remaining in the block is at least
+	 * the minimum, we're done.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
+		if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * Otherwise, we have to move some records around to keep the
+	 * tree balanced.  Look at the left and right sibling blocks to
+	 * see if we can re-balance by moving only one record.
+	 */
+	rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+	lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
+	bno = NULLAGBLOCK;
+	ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
+	/*
+	 * Duplicate the cursor so our btree manipulations here won't
+	 * disrupt the next level up.
+	 */
+	if ((error = xfs_btree_dup_cursor(cur, &tcur)))
+		return error;
+	/*
+	 * If there's a right sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (rbno != NULLAGBLOCK) {
+		/*
+		 * Move the temp cursor to the last entry in the next block.
+		 * Actually any entry but the first would suffice.
+		 */
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_increment(tcur, level, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Grab a pointer to the block.
+		 */
+		rbp = tcur->bc_bufs[level];
+		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+			goto error0;
+#endif
+		/*
+		 * Grab the current block number, for future use.
+		 */
+		bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		/*
+		 * If right block is full enough so that removing one entry
+		 * won't make it too empty, and left-shifting an entry out
+		 * of right to us works, we're done.
+		 */
+		if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
+		     XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
+			if ((error = xfs_alloc_lshift(tcur, level, &i)))
+				goto error0;
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_ALLOC_BLOCK_MINRECS(level, cur));
+				xfs_btree_del_cursor(tcur,
+						     XFS_BTREE_NOERROR);
+				if (level > 0 &&
+				    (error = xfs_alloc_decrement(cur, level,
+					    &i)))
+					return error;
+				*stat = 1;
+				return 0;
+			}
+		}
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference, and fix up the temp cursor to point
+		 * to our block again (last record).
+		 */
+		rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
+		if (lbno != NULLAGBLOCK) {
+			i = xfs_btree_firstrec(tcur, level);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if ((error = xfs_alloc_decrement(tcur, level, &i)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		}
+	}
+	/*
+	 * If there's a left sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (lbno != NULLAGBLOCK) {
+		/*
+		 * Move the temp cursor to the first entry in the
+		 * previous block.
+		 */
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_alloc_decrement(tcur, level, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		xfs_btree_firstrec(tcur, level);
+		/*
+		 * Grab a pointer to the block.
+		 */
+		lbp = tcur->bc_bufs[level];
+		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+			goto error0;
+#endif
+		/*
+		 * Grab the current block number, for future use.
+		 */
+		bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		/*
+		 * If left block is full enough so that removing one entry
+		 * won't make it too empty, and right-shifting an entry out
+		 * of left to us works, we're done.
+		 */
+		if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
+		     XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
+			if ((error = xfs_alloc_rshift(tcur, level, &i)))
+				goto error0;
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_ALLOC_BLOCK_MINRECS(level, cur));
+				xfs_btree_del_cursor(tcur,
+						     XFS_BTREE_NOERROR);
+				if (level == 0)
+					cur->bc_ptrs[0]++;
+				*stat = 1;
+				return 0;
+			}
+		}
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference.
+		 */
+		lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * Delete the temp cursor, we're done with it.
+	 */
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	/*
+	 * If here, we need to do a join to keep the tree balanced.
+	 */
+	ASSERT(bno != NULLAGBLOCK);
+	/*
+	 * See if we can join with the left neighbor block.
+	 */
+	if (lbno != NULLAGBLOCK &&
+	    lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * Set "right" to be the starting block,
+		 * "left" to be the left neighbor.
+		 */
+		rbno = bno;
+		right = block;
+		rbp = bp;
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, lbno, 0, &lbp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+		if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+			return error;
+	}
+	/*
+	 * If that won't work, see if we can join with the right neighbor block.
+	 */
+	else if (rbno != NULLAGBLOCK &&
+		 rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+		  XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * Set "left" to be the starting block,
+		 * "right" to be the right neighbor.
+		 */
+		lbno = bno;
+		left = block;
+		lbp = bp;
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, rbno, 0, &rbp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+		if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+			return error;
+	}
+	/*
+	 * Otherwise, we can't fix the imbalance.
+	 * Just return.	 This is probably a logic error, but it's not fatal.
+	 */
+	else {
+		if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * We're now going to join "left" and "right" by moving all the stuff
+	 * in "right" to "left" and deleting "right".
+	 */
+	if (level > 0) {
+		/*
+		 * It's a non-leaf.  Move keys and pointers.
+		 */
+		lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
+		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		bcopy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lkp)); /* INT_: structure copy */
+		bcopy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lpp)); /* INT_: structure copy */
+		xfs_alloc_log_keys(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_alloc_log_ptrs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	} else {
+		/*
+		 * It's a leaf.	 Move records.
+		 */
+		lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+		bcopy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lrp));
+		xfs_alloc_log_recs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	}
+	/*
+	 * If we joined with the left neighbor, set the buffer in the
+	 * cursor to the left block, and fix up the index.
+	 */
+	if (bp != lbp) {
+		xfs_btree_setbuf(cur, level, lbp);
+		cur->bc_ptrs[level] += INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * If we joined with the right neighbor and there's a level above
+	 * us, increment the cursor at that level.
+	 */
+	else if (level + 1 < cur->bc_nlevels &&
+		 (error = xfs_alloc_increment(cur, level + 1, &i)))
+		return error;
+	/*
+	 * Fix up the number of records in the surviving block.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	/*
+	 * Fix up the right block pointer in the surviving block, and log it.
+	 */
+	left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */
+	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	/*
+	 * If there is a right sibling now, make it point to the
+	 * remaining block.
+	 */
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		xfs_alloc_block_t	*rrblock;
+		xfs_buf_t		*rrbp;
+
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
+				&rrbp, XFS_ALLOC_BTREE_REF)))
+			return error;
+		rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
+		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
+			return error;
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
+		xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * Free the deleting block by putting it on the freelist.
+	 */
+	if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+			NULL, rbno)))
+		return error;
+	xfs_trans_agbtree_delta(cur->bc_tp, -1);
+	/*
+	 * Adjust the current level's cursor so that we're left referring
+	 * to the right node, after we're done.
+	 * If this leaves the ptr value 0 our caller will fix it up.
+	 */
+	if (level > 0)
+		cur->bc_ptrs[level]--;
+	/*
+	 * Return value means the next level up has something to do.
+	 */
+	*stat = 2;
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int				/* error */
+xfs_alloc_insrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to insert record at */
+	xfs_agblock_t		*bnop,	/* i/o: block number inserted */
+	xfs_alloc_rec_t		*recp,	/* i/o: record data inserted */
+	xfs_btree_cur_t		**curp, /* output: new cursor replacing cur */
+	int			*stat)	/* output: success/failure */
+{
+	xfs_agf_t		*agf;	/* allocation group freelist header */
+	xfs_alloc_block_t	*block; /* btree block record/key lives in */
+	xfs_buf_t		*bp;	/* buffer for block */
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_alloc_key_t		key;	/* key value being inserted */
+	xfs_alloc_key_t		*kp;	/* pointer to btree keys */
+	xfs_agblock_t		nbno;	/* block number of allocated block */
+	xfs_btree_cur_t		*ncur;	/* new cursor to be used at next lvl */
+	xfs_alloc_key_t		nkey;	/* new key value, from split */
+	xfs_alloc_rec_t		nrec;	/* new record value, for caller */
+	int			optr;	/* old ptr value */
+	xfs_alloc_ptr_t		*pp;	/* pointer to btree addresses */
+	int			ptr;	/* index in btree block for this rec */
+	xfs_alloc_rec_t		*rp;	/* pointer to btree records */
+
+	ASSERT(INT_GET(recp->ar_blockcount, ARCH_CONVERT) > 0);
+	/*
+	 * If we made it to the root level, allocate a new root block
+	 * and we're done.
+	 */
+	if (level >= cur->bc_nlevels) {
+		XFS_STATS_INC(xfsstats.xs_abt_insrec);
+		if ((error = xfs_alloc_newroot(cur, &i)))
+			return error;
+		*bnop = NULLAGBLOCK;
+		*stat = i;
+		return 0;
+	}
+	/*
+	 * Make a key out of the record data to be inserted, and save it.
+	 */
+	key.ar_startblock = recp->ar_startblock; /* INT_: direct copy */
+	key.ar_blockcount = recp->ar_blockcount; /* INT_: direct copy */
+	optr = ptr = cur->bc_ptrs[level];
+	/*
+	 * If we're off the left edge, return failure.
+	 */
+	if (ptr == 0) {
+		*stat = 0;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_abt_insrec);
+	/*
+	 * Get pointers to the btree buffer and block.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+	/*
+	 * Check that the new entry is being inserted in the right place.
+	 */
+	if (ptr <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		if (level == 0) {
+			rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
+			xfs_btree_check_rec(cur->bc_btnum, recp, rp);
+		} else {
+			kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
+			xfs_btree_check_key(cur->bc_btnum, &key, kp);
+		}
+	}
+#endif
+	nbno = NULLAGBLOCK;
+	ncur = (xfs_btree_cur_t *)0;
+	/*
+	 * If the block is full, we can't insert the new entry until we
+	 * make the block un-full.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * First, try shifting an entry to the right neighbor.
+		 */
+		if ((error = xfs_alloc_rshift(cur, level, &i)))
+			return error;
+		if (i) {
+			/* nothing */
+		}
+		/*
+		 * Next, try shifting an entry to the left neighbor.
+		 */
+		else {
+			if ((error = xfs_alloc_lshift(cur, level, &i)))
+				return error;
+			if (i)
+				optr = ptr = cur->bc_ptrs[level];
+			else {
+				/*
+				 * Next, try splitting the current block in
+				 * half. If this works we have to re-set our
+				 * variables because we could be in a
+				 * different block now.
+				 */
+				if ((error = xfs_alloc_split(cur, level, &nbno,
+						&nkey, &ncur, &i)))
+					return error;
+				if (i) {
+					bp = cur->bc_bufs[level];
+					block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+					if ((error =
+						xfs_btree_check_sblock(cur,
+							block, level, bp)))
+						return error;
+#endif
+					ptr = cur->bc_ptrs[level];
+					nrec.ar_startblock = nkey.ar_startblock; /* INT_: direct copy */
+					nrec.ar_blockcount = nkey.ar_blockcount; /* INT_: direct copy */
+				}
+				/*
+				 * Otherwise the insert fails.
+				 */
+				else {
+					*stat = 0;
+					return 0;
+				}
+			}
+		}
+	}
+	/*
+	 * At this point we know there's room for our new entry in the block
+	 * we're pointing at.
+	 */
+	if (level > 0) {
+		/*
+		 * It's a non-leaf entry.  Make a hole for the new data
+		 * in the key and ptr regions of the block.
+		 */
+		kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
+		pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(block->bb_numrecs, ARCH_CONVERT); i >= ptr; i--) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		ovbcopy(&kp[ptr - 1], &kp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*kp)); /* INT_: copy */
+		ovbcopy(&pp[ptr - 1], &pp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*pp)); /* INT_: copy */
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
+			return error;
+#endif
+		/*
+		 * Now stuff the new data in, bump numrecs and log the new data.
+		 */
+		kp[ptr - 1] = key;
+		INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
+		INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
+		xfs_alloc_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+		xfs_alloc_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+#ifdef DEBUG
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
+				kp + ptr);
+#endif
+	} else {
+		/*
+		 * It's a leaf entry.  Make a hole for the new record.
+		 */
+		rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
+		ovbcopy(&rp[ptr - 1], &rp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*rp));
+		/*
+		 * Now stuff the new record in, bump numrecs
+		 * and log the new data.
+		 */
+		rp[ptr - 1] = *recp; /* INT_: struct copy */
+		INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
+		xfs_alloc_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+#ifdef DEBUG
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
+				rp + ptr);
+#endif
+	}
+	/*
+	 * Log the new number of records in the btree header.
+	 */
+	xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
+	/*
+	 * If we inserted at the start of a block, update the parents' keys.
+	 */
+	if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
+		return error;
+	/*
+	 * Look to see if the longest extent in the allocation group
+	 * needs to be updated.
+	 */
+
+	agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+	if (level == 0 &&
+	    cur->bc_btnum == XFS_BTNUM_CNT &&
+	    INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
+	    INT_GET(recp->ar_blockcount, ARCH_CONVERT) > INT_GET(agf->agf_longest, ARCH_CONVERT)) {
+		/*
+		 * If this is a leaf in the by-size btree and there
+		 * is no right sibling block and this block is bigger
+		 * than the previous longest block, update it.
+		 */
+		INT_COPY(agf->agf_longest, recp->ar_blockcount, ARCH_CONVERT);
+		cur->bc_mp->m_perag[INT_GET(agf->agf_seqno, ARCH_CONVERT)].pagf_longest
+			= INT_GET(recp->ar_blockcount, ARCH_CONVERT);
+		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+			XFS_AGF_LONGEST);
+	}
+	/*
+	 * Return the new block number, if any.
+	 * If there is one, give back a record value and a cursor too.
+	 */
+	*bnop = nbno;
+	if (nbno != NULLAGBLOCK) {
+		*recp = nrec; /* INT_: struct copy */
+		*curp = ncur; /* INT_: struct copy */
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Log header fields from a btree block.
+ */
+STATIC void
+xfs_alloc_log_block(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			fields) /* mask of fields: XFS_BB_... */
+{
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	static const short	offsets[] = {	/* table of offsets */
+		offsetof(xfs_alloc_block_t, bb_magic),
+		offsetof(xfs_alloc_block_t, bb_level),
+		offsetof(xfs_alloc_block_t, bb_numrecs),
+		offsetof(xfs_alloc_block_t, bb_leftsib),
+		offsetof(xfs_alloc_block_t, bb_rightsib),
+		sizeof(xfs_alloc_block_t)
+	};
+
+	xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
+	xfs_trans_log_buf(tp, bp, first, last);
+}
+
+/*
+ * Log keys from a btree block (nonleaf).
+ */
+STATIC void
+xfs_alloc_log_keys(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			kfirst, /* index of first key to log */
+	int			klast)	/* index of last key to log */
+{
+	xfs_alloc_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	xfs_alloc_key_t		*kp;	/* key pointer in btree block */
+	int			last;	/* last byte offset logged */
+
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+	kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_alloc_log_ptrs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			pfirst, /* index of first pointer to log */
+	int			plast)	/* index of last pointer to log */
+{
+	xfs_alloc_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	xfs_alloc_ptr_t		*pp;	/* block-pointer pointer in btree blk */
+
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+	pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Log records from a btree block (leaf).
+ */
+STATIC void
+xfs_alloc_log_recs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			rfirst, /* index of first record to log */
+	int			rlast)	/* index of last record to log */
+{
+	xfs_alloc_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	xfs_alloc_rec_t		*rp;	/* record pointer for btree block */
+
+
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+	rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
+#ifdef DEBUG
+	{
+		xfs_agf_t	*agf;
+		xfs_alloc_rec_t *p;
+
+		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+		for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
+			ASSERT(INT_GET(p->ar_startblock, ARCH_CONVERT) + INT_GET(p->ar_blockcount, ARCH_CONVERT) <=
+			       INT_GET(agf->agf_length, ARCH_CONVERT));
+	}
+#endif
+	first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ * Return 0 if can't find any such record, 1 for success.
+ */
+STATIC int				/* error */
+xfs_alloc_lookup(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_lookup_t		dir,	/* <=, ==, or >= */
+	int			*stat)	/* success/failure */
+{
+	xfs_agblock_t		agbno;	/* a.g. relative btree block number */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	xfs_alloc_block_t	*block=NULL;	/* current btree block */
+	int			diff;	/* difference for the current key */
+	int			error;	/* error return value */
+	int			keyno=0;	/* current key number */
+	int			level;	/* level in the btree */
+	xfs_mount_t		*mp;	/* file system mount point */
+
+	XFS_STATS_INC(xfsstats.xs_abt_lookup);
+	/*
+	 * Get the allocation group header, and the root block number.
+	 */
+	mp = cur->bc_mp;
+
+	{
+		xfs_agf_t	*agf;	/* a.g. freespace header */
+
+		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+		agno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
+		agbno = INT_GET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT);
+	}
+	/*
+	 * Iterate over each level in the btree, starting at the root.
+	 * For each level above the leaves, find the key we need, based
+	 * on the lookup record, then follow the corresponding block
+	 * pointer down to the next level.
+	 */
+	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+		xfs_buf_t	*bp;	/* buffer pointer for btree block */
+		xfs_daddr_t	d;	/* disk address of btree block */
+
+		/*
+		 * Get the disk address we're looking for.
+		 */
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+		/*
+		 * If the old buffer at this level is for a different block,
+		 * throw it away, otherwise just use it.
+		 */
+		bp = cur->bc_bufs[level];
+		if (bp && XFS_BUF_ADDR(bp) != d)
+			bp = (xfs_buf_t *)0;
+		if (!bp) {
+			/*
+			 * Need to get a new buffer.  Read it, then
+			 * set it in the cursor, releasing the old one.
+			 */
+			if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
+					agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
+				return error;
+			xfs_btree_setbuf(cur, level, bp);
+			/*
+			 * Point to the btree block, now that we have the buffer
+			 */
+			block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+			if ((error = xfs_btree_check_sblock(cur, block, level,
+					bp)))
+				return error;
+		} else
+			block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+		/*
+		 * If we already had a key match at a higher level, we know
+		 * we need to use the first entry in this block.
+		 */
+		if (diff == 0)
+			keyno = 1;
+		/*
+		 * Otherwise we need to search this block.  Do a binary search.
+		 */
+		else {
+			int		high;	/* high entry number */
+			xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
+			xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
+			int		low;	/* low entry number */
+
+			/*
+			 * Get a pointer to keys or records.
+			 */
+			if (level > 0)
+				kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
+			else
+				krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
+			/*
+			 * Set low and high entry numbers, 1-based.
+			 */
+			low = 1;
+			if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
+				/*
+				 * If the block is empty, the tree must
+				 * be an empty leaf.
+				 */
+				ASSERT(level == 0 && cur->bc_nlevels == 1);
+				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+				*stat = 0;
+				return 0;
+			}
+			/*
+			 * Binary search the block.
+			 */
+			while (low <= high) {
+				xfs_extlen_t	blockcount;	/* key value */
+				xfs_agblock_t	startblock;	/* key value */
+
+				XFS_STATS_INC(xfsstats.xs_abt_compare);
+				/*
+				 * keyno is average of low and high.
+				 */
+				keyno = (low + high) >> 1;
+				/*
+				 * Get startblock & blockcount.
+				 */
+				if (level > 0) {
+					xfs_alloc_key_t *kkp;
+
+					kkp = kkbase + keyno - 1;
+					startblock = INT_GET(kkp->ar_startblock, ARCH_CONVERT);
+					blockcount = INT_GET(kkp->ar_blockcount, ARCH_CONVERT);
+				} else {
+					xfs_alloc_rec_t *krp;
+
+					krp = krbase + keyno - 1;
+					startblock = INT_GET(krp->ar_startblock, ARCH_CONVERT);
+					blockcount = INT_GET(krp->ar_blockcount, ARCH_CONVERT);
+				}
+				/*
+				 * Compute difference to get next direction.
+				 */
+				if (cur->bc_btnum == XFS_BTNUM_BNO)
+					diff = (int)startblock -
+					       (int)cur->bc_rec.a.ar_startblock;
+				else if (!(diff = (int)blockcount -
+					    (int)cur->bc_rec.a.ar_blockcount))
+					diff = (int)startblock -
+					    (int)cur->bc_rec.a.ar_startblock;
+				/*
+				 * Less than, move right.
+				 */
+				if (diff < 0)
+					low = keyno + 1;
+				/*
+				 * Greater than, move left.
+				 */
+				else if (diff > 0)
+					high = keyno - 1;
+				/*
+				 * Equal, we're done.
+				 */
+				else
+					break;
+			}
+		}
+		/*
+		 * If there are more levels, set up for the next level
+		 * by getting the block number and filling in the cursor.
+		 */
+		if (level > 0) {
+			/*
+			 * If we moved left, need the previous key number,
+			 * unless there isn't one.
+			 */
+			if (diff > 0 && --keyno < 1)
+				keyno = 1;
+			agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, keyno, cur), ARCH_CONVERT);
+#ifdef DEBUG
+			if ((error = xfs_btree_check_sptr(cur, agbno, level)))
+				return error;
+#endif
+			cur->bc_ptrs[level] = keyno;
+		}
+	}
+	/*
+	 * Done with the search.
+	 * See if we need to adjust the results.
+	 */
+	if (dir != XFS_LOOKUP_LE && diff < 0) {
+		keyno++;
+		/*
+		 * If ge search and we went off the end of the block, but it's
+		 * not the last block, we're in the wrong block.
+		 */
+		if (dir == XFS_LOOKUP_GE &&
+		    keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
+		    INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			int	i;
+
+			cur->bc_ptrs[0] = keyno;
+			if ((error = xfs_alloc_increment(cur, 0, &i)))
+				return error;
+			XFS_WANT_CORRUPTED_RETURN(i == 1);
+			*stat = 1;
+			return 0;
+		}
+	}
+	else if (dir == XFS_LOOKUP_LE && diff > 0)
+		keyno--;
+	cur->bc_ptrs[0] = keyno;
+	/*
+	 * Return if we succeeded or not.
+	 */
+	if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT))
+		*stat = 0;
+	else
+		*stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
+	return 0;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int				/* error */
+xfs_alloc_lshift(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to shift record on */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+#ifdef DEBUG
+	int			i;	/* loop index */
+#endif
+	xfs_alloc_key_t		key;	/* key value for leaf level upward */
+	xfs_buf_t		*lbp;	/* buffer for left neighbor block */
+	xfs_alloc_block_t	*left;	/* left neighbor btree block */
+	int			nrec;	/* new number of left block entries */
+	xfs_buf_t		*rbp;	/* buffer for right (current) block */
+	xfs_alloc_block_t	*right; /* right (current) btree block */
+	xfs_alloc_key_t		*rkp=NULL;	/* key pointer for right block */
+	xfs_alloc_ptr_t		*rpp=NULL;	/* address pointer for right block */
+	xfs_alloc_rec_t		*rrp=NULL;	/* record pointer for right block */
+
+	/*
+	 * Set up variables for this block as "right".
+	 */
+	rbp = cur->bc_bufs[level];
+	right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+		return error;
+#endif
+	/*
+	 * If we've got no left sibling then we can't shift an entry left.
+	 */
+	if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] <= 1) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Set up the left neighbor as "left".
+	 */
+	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp,
+			XFS_ALLOC_BTREE_REF)))
+		return error;
+	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+	/*
+	 * If it's full, it can't take another entry.
+	 */
+	if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		*stat = 0;
+		return 0;
+	}
+	nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
+	/*
+	 * If non-leaf, copy a key and a ptr to the left block.
+	 */
+	if (level > 0) {
+		xfs_alloc_key_t *lkp;	/* key pointer for left block */
+		xfs_alloc_ptr_t *lpp;	/* address pointer for left block */
+
+		lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
+		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
+		*lkp = *rkp;
+		xfs_alloc_log_keys(cur, lbp, nrec, nrec);
+		lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
+		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level)))
+			return error;
+#endif
+		*lpp = *rpp; /* INT_: copy */
+		xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
+		xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
+	}
+	/*
+	 * If leaf, copy a record to the left block.
+	 */
+	else {
+		xfs_alloc_rec_t *lrp;	/* record pointer for left block */
+
+		lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
+		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+		*lrp = *rrp;
+		xfs_alloc_log_recs(cur, lbp, nrec, nrec);
+		xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
+	}
+	/*
+	 * Bump and log left's numrecs, decrement and log right's numrecs.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1);
+	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
+	/*
+	 * Slide the contents of right down one entry.
+	 */
+	if (level > 0) {
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
+					level)))
+				return error;
+		}
+#endif
+		ovbcopy(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+		xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	} else {
+		ovbcopy(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
+		key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
+		rkp = &key;
+	}
+	/*
+	 * Update the parent key values of right.
+	 */
+	if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
+		return error;
+	/*
+	 * Slide the cursor value left one.
+	 */
+	cur->bc_ptrs[level]--;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int				/* error */
+xfs_alloc_newroot(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	xfs_agblock_t		lbno;	/* left block number */
+	xfs_buf_t		*lbp;	/* left btree buffer */
+	xfs_alloc_block_t	*left;	/* left btree block */
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_agblock_t		nbno;	/* new block number */
+	xfs_buf_t		*nbp;	/* new (root) buffer */
+	xfs_alloc_block_t	*new;	/* new (root) btree block */
+	int			nptr;	/* new value for key index, 1 or 2 */
+	xfs_agblock_t		rbno;	/* right block number */
+	xfs_buf_t		*rbp;	/* right btree buffer */
+	xfs_alloc_block_t	*right; /* right btree block */
+
+	mp = cur->bc_mp;
+
+	ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
+	/*
+	 * Get a buffer from the freelist blocks, for the new root.
+	 */
+	if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+			&nbno)))
+		return error;
+	/*
+	 * None available, we fail.
+	 */
+	if (nbno == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	xfs_trans_agbtree_delta(cur->bc_tp, 1);
+	nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
+		0);
+	new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
+	/*
+	 * Set the root data in the a.g. freespace structure.
+	 */
+	{
+		xfs_agf_t	*agf;	/* a.g. freespace header */
+		xfs_agnumber_t	seqno;
+
+		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+		INT_SET(agf->agf_roots[cur->bc_btnum], ARCH_CONVERT, nbno);
+		INT_MOD(agf->agf_levels[cur->bc_btnum], ARCH_CONVERT, 1);
+		seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
+		mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
+		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+			XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+	}
+	/*
+	 * At the previous root level there are now two blocks: the old
+	 * root, and the new block generated when it was split.
+	 * We don't know which one the cursor is pointing at, so we
+	 * set up variables "left" and "right" for each case.
+	 */
+	lbp = cur->bc_bufs[cur->bc_nlevels - 1];
+	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
+		return error;
+#endif
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		/*
+		 * Our block is left, pick up the right block.
+		 */
+		lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
+		rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, rbno, 0, &rbp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+		if ((error = xfs_btree_check_sblock(cur, right,
+				cur->bc_nlevels - 1, rbp)))
+			return error;
+		nptr = 1;
+	} else {
+		/*
+		 * Our block is right, pick up the left block.
+		 */
+		rbp = lbp;
+		right = left;
+		rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
+		lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+				cur->bc_private.a.agno, lbno, 0, &lbp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+		if ((error = xfs_btree_check_sblock(cur, left,
+				cur->bc_nlevels - 1, lbp)))
+			return error;
+		nptr = 2;
+	}
+	/*
+	 * Fill in the new block's btree header and log it.
+	 */
+	INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
+	INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels);
+	INT_SET(new->bb_numrecs, ARCH_CONVERT, 2);
+	INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+	INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+	xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
+	ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
+	/*
+	 * Fill in the key data in the new root.
+	 */
+	{
+		xfs_alloc_key_t		*kp;	/* btree key pointer */
+
+		kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
+		if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) {
+			kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */
+			kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */
+		} else {
+			xfs_alloc_rec_t *rp;	/* btree record pointer */
+
+			rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
+			kp[0].ar_startblock = rp->ar_startblock; /* INT_: direct copy */
+			kp[0].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */
+			rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+			kp[1].ar_startblock = rp->ar_startblock; /* INT_: direct copy */
+			kp[1].ar_blockcount = rp->ar_blockcount; /* INT_: direct copy */
+		}
+	}
+	xfs_alloc_log_keys(cur, nbp, 1, 2);
+	/*
+	 * Fill in the pointer data in the new root.
+	 */
+	{
+		xfs_alloc_ptr_t		*pp;	/* btree address pointer */
+
+		pp = XFS_ALLOC_PTR_ADDR(new, 1, cur);
+		INT_SET(pp[0], ARCH_CONVERT, lbno);
+		INT_SET(pp[1], ARCH_CONVERT, rbno);
+	}
+	xfs_alloc_log_ptrs(cur, nbp, 1, 2);
+	/*
+	 * Fix up the cursor.
+	 */
+	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+	cur->bc_ptrs[cur->bc_nlevels] = nptr;
+	cur->bc_nlevels++;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int				/* error */
+xfs_alloc_rshift(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to shift record on */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_alloc_key_t		key;	/* key value for leaf level upward */
+	xfs_buf_t		*lbp;	/* buffer for left (current) block */
+	xfs_alloc_block_t	*left;	/* left (current) btree block */
+	xfs_buf_t		*rbp;	/* buffer for right neighbor block */
+	xfs_alloc_block_t	*right; /* right neighbor btree block */
+	xfs_alloc_key_t		*rkp;	/* key pointer for right block */
+	xfs_btree_cur_t		*tcur;	/* temporary cursor */
+
+	/*
+	 * Set up variables for this block as "left".
+	 */
+	lbp = cur->bc_bufs[level];
+	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+#endif
+	/*
+	 * If we've got no right sibling then we can't shift an entry right.
+	 */
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Set up the right neighbor as "right".
+	 */
+	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp,
+			XFS_ALLOC_BTREE_REF)))
+		return error;
+	right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+		return error;
+	/*
+	 * If it's full, it can't take another entry.
+	 */
+	if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Make a hole at the start of the right neighbor block, then
+	 * copy the last left block entry to the hole.
+	 */
+	if (level > 0) {
+		xfs_alloc_key_t *lkp;	/* key pointer for left block */
+		xfs_alloc_ptr_t *lpp;	/* address pointer for left block */
+		xfs_alloc_ptr_t *rpp;	/* address pointer for right block */
+
+		lkp = XFS_ALLOC_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		lpp = XFS_ALLOC_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
+		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		ovbcopy(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level)))
+			return error;
+#endif
+		*rkp = *lkp; /* INT_: copy */
+		*rpp = *lpp; /* INT_: copy */
+		xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
+	} else {
+		xfs_alloc_rec_t *lrp;	/* record pointer for left block */
+		xfs_alloc_rec_t *rrp;	/* record pointer for right block */
+
+		lrp = XFS_ALLOC_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+		ovbcopy(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		*rrp = *lrp;
+		xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		key.ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
+		key.ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
+		rkp = &key;
+		xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
+	}
+	/*
+	 * Decrement and log left's numrecs, bump and log right's numrecs.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
+	/*
+	 * Using a temporary cursor, update the parent key values of the
+	 * block on the right.
+	 */
+	if ((error = xfs_btree_dup_cursor(cur, &tcur)))
+		return error;
+	i = xfs_btree_lastrec(tcur, level);
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	if ((error = xfs_alloc_increment(tcur, level, &i)) ||
+	    (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
+		goto error0;
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	*stat = 1;
+	return 0;
+error0:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and its first record (to be inserted into parent).
+ */
+STATIC int				/* error */
+xfs_alloc_split(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to split */
+	xfs_agblock_t		*bnop,	/* output: block number allocated */
+	xfs_alloc_key_t		*keyp,	/* output: first key of new block */
+	xfs_btree_cur_t		**curp, /* output: new cursor */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	int			i;	/* loop index/record number */
+	xfs_agblock_t		lbno;	/* left (current) block number */
+	xfs_buf_t		*lbp;	/* buffer for left block */
+	xfs_alloc_block_t	*left;	/* left (current) btree block */
+	xfs_agblock_t		rbno;	/* right (new) block number */
+	xfs_buf_t		*rbp;	/* buffer for right block */
+	xfs_alloc_block_t	*right; /* right (new) btree block */
+
+	/*
+	 * Allocate the new block from the freelist.
+	 * If we can't do it, we're toast.  Give up.
+	 */
+	if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+			&rbno)))
+		return error;
+	if (rbno == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	xfs_trans_agbtree_delta(cur->bc_tp, 1);
+	rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
+		rbno, 0);
+	/*
+	 * Set up the new block as "right".
+	 */
+	right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+	/*
+	 * "Left" is the current (according to the cursor) block.
+	 */
+	lbp = cur->bc_bufs[level];
+	left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+#endif
+	/*
+	 * Fill in the btree header for the new block.
+	 */
+	INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
+	right->bb_level = left->bb_level; /* INT_: direct copy */
+	INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
+	/*
+	 * Make sure that if there's an odd number of entries now, that
+	 * each new block will have the same number of entries.
+	 */
+	if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
+	    cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
+		INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+	i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
+	/*
+	 * For non-leaf blocks, copy keys and addresses over to the new block.
+	 */
+	if (level > 0) {
+		xfs_alloc_key_t *lkp;	/* left btree key pointer */
+		xfs_alloc_ptr_t *lpp;	/* left btree address pointer */
+		xfs_alloc_key_t *rkp;	/* right btree key pointer */
+		xfs_alloc_ptr_t *rpp;	/* right btree address pointer */
+
+		lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
+		lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
+		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
+		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		bcopy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp)); /* INT_: copy */
+		bcopy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));/* INT_: copy */
+		xfs_alloc_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_alloc_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		*keyp = *rkp;
+	}
+	/*
+	 * For leaf blocks, copy records over to the new block.
+	 */
+	else {
+		xfs_alloc_rec_t *lrp;	/* left btree record pointer */
+		xfs_alloc_rec_t *rrp;	/* right btree record pointer */
+
+		lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
+		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
+		bcopy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_alloc_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		keyp->ar_startblock = rrp->ar_startblock; /* INT_: direct copy */
+		keyp->ar_blockcount = rrp->ar_blockcount; /* INT_: direct copy */
+	}
+	/*
+	 * Find the left block number by looking in the buffer.
+	 * Adjust numrecs, sibling pointers.
+	 */
+	lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
+	right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
+	INT_SET(left->bb_rightsib, ARCH_CONVERT, rbno);
+	INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
+	xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
+	xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	/*
+	 * If there's a block to the new block's right, make that block
+	 * point back to right instead of to left.
+	 */
+	if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		xfs_alloc_block_t	*rrblock;	/* rr btree block */
+		xfs_buf_t		*rrbp;		/* buffer for rrblock */
+
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.a.agno, INT_GET(right->bb_rightsib, ARCH_CONVERT), 0,
+				&rrbp, XFS_ALLOC_BTREE_REF)))
+			return error;
+		rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
+		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
+			return error;
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, rbno);
+		xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * If the cursor is really in the right block, move it there.
+	 * If it's just pointing past the last entry in left, then we'll
+	 * insert there, so don't change anything in that case.
+	 */
+	if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
+		xfs_btree_setbuf(cur, level, rbp);
+		cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * If there are more levels, we'll need another cursor which refers to
+	 * the right block, no matter where this cursor was.
+	 */
+	if (level + 1 < cur->bc_nlevels) {
+		if ((error = xfs_btree_dup_cursor(cur, curp)))
+			return error;
+		(*curp)->bc_ptrs[level + 1]++;
+	}
+	*bnop = rbno;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int				/* error */
+xfs_alloc_updkey(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_alloc_key_t		*keyp,	/* new key value to update to */
+	int			level)	/* starting level for update */
+{
+	int			ptr;	/* index of key in block */
+
+	/*
+	 * Go up the tree from this level toward the root.
+	 * At each level, update the key value to the value input.
+	 * Stop when we reach a level where the cursor isn't pointing
+	 * at the first entry in the block.
+	 */
+	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+		xfs_alloc_block_t	*block; /* btree block */
+		xfs_buf_t		*bp;	/* buffer for block */
+#ifdef DEBUG
+		int			error;	/* error return value */
+#endif
+		xfs_alloc_key_t		*kp;	/* ptr to btree block keys */
+
+		bp = cur->bc_bufs[level];
+		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+			return error;
+#endif
+		ptr = cur->bc_ptrs[level];
+		kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
+		*kp = *keyp;
+		xfs_alloc_log_keys(cur, bp, ptr, ptr);
+	}
+	return 0;
+}
+
+/*
+ * Externally visible routines.
+ */
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_alloc_decrement(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat)	/* success/failure */
+{
+	xfs_alloc_block_t	*block; /* btree block */
+	int			error;	/* error return value */
+	int			lev;	/* btree level */
+
+	ASSERT(level < cur->bc_nlevels);
+	/*
+	 * Read-ahead to the left at this level.
+	 */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+	/*
+	 * Decrement the ptr at this level.  If we're still in the block
+	 * then we're done.
+	 */
+	if (--cur->bc_ptrs[level] > 0) {
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * Get a pointer to the btree block.
+	 */
+	block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level,
+			cur->bc_bufs[level])))
+		return error;
+#endif
+	/*
+	 * If we just went off the left edge of the tree, return failure.
+	 */
+	if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * March up the tree decrementing pointers.
+	 * Stop when we don't go off the left edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		if (--cur->bc_ptrs[lev] > 0)
+			break;
+		/*
+		 * Read-ahead the left block, we're going to read it
+		 * in the next loop.
+		 */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+	}
+	/*
+	 * If we went off the root then we are seriously confused.
+	 */
+	ASSERT(lev < cur->bc_nlevels);
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
+		xfs_agblock_t	agbno;	/* block number of btree block */
+		xfs_buf_t	*bp;	/* buffer pointer for block */
+
+		agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.a.agno, agbno, 0, &bp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+		cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int					/* error */
+xfs_alloc_delete(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+	int		i;		/* result code */
+	int		level;		/* btree level */
+
+	/*
+	 * Go up the tree, starting at leaf level.
+	 * If 2 is returned then a join was done; go to the next level.
+	 * Otherwise we are done.
+	 */
+	for (level = 0, i = 2; i == 2; level++) {
+		if ((error = xfs_alloc_delrec(cur, level, &i)))
+			return error;
+	}
+	if (i == 0) {
+		for (level = 1; level < cur->bc_nlevels; level++) {
+			if (cur->bc_ptrs[level] == 0) {
+				if ((error = xfs_alloc_decrement(cur, level, &i)))
+					return error;
+				break;
+			}
+		}
+	}
+	*stat = i;
+	return 0;
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_alloc_get_rec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat)	/* output: success/failure */
+{
+	xfs_alloc_block_t	*block; /* btree block */
+#ifdef DEBUG
+	int			error;	/* error return value */
+#endif
+	int			ptr;	/* record number */
+
+	ptr = cur->bc_ptrs[0];
+	block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
+		return error;
+#endif
+	/*
+	 * Off the right end or left end, return failure.
+	 */
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Point to the record and extract its data.
+	 */
+	{
+		xfs_alloc_rec_t		*rec;	/* record data */
+
+		rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
+		*bno = INT_GET(rec->ar_startblock, ARCH_CONVERT);
+		*len = INT_GET(rec->ar_blockcount, ARCH_CONVERT);
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_alloc_increment(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat)	/* success/failure */
+{
+	xfs_alloc_block_t	*block; /* btree block */
+	xfs_buf_t		*bp;	/* tree block buffer */
+	int			error;	/* error return value */
+	int			lev;	/* btree level */
+
+	ASSERT(level < cur->bc_nlevels);
+	/*
+	 * Read-ahead to the right at this level.
+	 */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+	/*
+	 * Get a pointer to the btree block.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+#endif
+	/*
+	 * Increment the ptr at this level.  If we're still in the block
+	 * then we're done.
+	 */
+	if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * If we just went off the right edge of the tree, return failure.
+	 */
+	if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * March up the tree incrementing pointers.
+	 * Stop when we don't go off the right edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		bp = cur->bc_bufs[lev];
+		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+#endif
+		if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			break;
+		/*
+		 * Read-ahead the right block, we're going to read it
+		 * in the next loop.
+		 */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+	}
+	/*
+	 * If we went off the root then we are seriously confused.
+	 */
+	ASSERT(lev < cur->bc_nlevels);
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+	     lev > level; ) {
+		xfs_agblock_t	agbno;	/* block number of btree block */
+
+		agbno = INT_GET(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.a.agno, agbno, 0, &bp,
+				XFS_ALLOC_BTREE_REF)))
+			return error;
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+		cur->bc_ptrs[lev] = 1;
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Insert the current record at the point referenced by cur.
+ * The cursor may be inconsistent on return if splits have been done.
+ */
+int					/* error */
+xfs_alloc_insert(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+	int		i;		/* result value, 0 for failure */
+	int		level;		/* current level number in btree */
+	xfs_agblock_t	nbno;		/* new block number (split result) */
+	xfs_btree_cur_t *ncur;		/* new cursor (split result) */
+	xfs_alloc_rec_t nrec;		/* record being inserted this level */
+	xfs_btree_cur_t *pcur;		/* previous level's cursor */
+
+	level = 0;
+	nbno = NULLAGBLOCK;
+	INT_SET(nrec.ar_startblock, ARCH_CONVERT, cur->bc_rec.a.ar_startblock);
+	INT_SET(nrec.ar_blockcount, ARCH_CONVERT, cur->bc_rec.a.ar_blockcount);
+	ncur = (xfs_btree_cur_t *)0;
+	pcur = cur;
+	/*
+	 * Loop going up the tree, starting at the leaf level.
+	 * Stop when we don't get a split block, that must mean that
+	 * the insert is finished with this level.
+	 */
+	do {
+		/*
+		 * Insert nrec/nbno into this level of the tree.
+		 * Note if we fail, nbno will be null.
+		 */
+		if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
+				&i))) {
+			if (pcur != cur)
+				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+			return error;
+		}
+		/*
+		 * See if the cursor we just used is trash.
+		 * Can't trash the caller's cursor, but otherwise we should
+		 * if ncur is a new cursor or we're about to be done.
+		 */
+		if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
+			cur->bc_nlevels = pcur->bc_nlevels;
+			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+		}
+		/*
+		 * If we got a new cursor, switch to it.
+		 */
+		if (ncur) {
+			pcur = ncur;
+			ncur = (xfs_btree_cur_t *)0;
+		}
+	} while (nbno != NULLAGBLOCK);
+	*stat = i;
+	return 0;
+}
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_eq(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agblock_t	bno,		/* starting block of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_ge(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agblock_t	bno,		/* starting block of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_le(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agblock_t	bno,		/* starting block of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur, to the value given by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int					/* error */
+xfs_alloc_update(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len)	/* length of extent */
+{
+	xfs_alloc_block_t	*block; /* btree block to update */
+	int			error;	/* error return value */
+	int			ptr;	/* current record number (updating) */
+
+	ASSERT(len > 0);
+	/*
+	 * Pick up the a.g. freelist struct and the current block.
+	 */
+	block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
+		return error;
+#endif
+	/*
+	 * Get the address of the rec to be updated.
+	 */
+	ptr = cur->bc_ptrs[0];
+	{
+		xfs_alloc_rec_t		*rp;	/* pointer to updated record */
+
+		rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
+		/*
+		 * Fill in the new contents and log them.
+		 */
+		INT_SET(rp->ar_startblock, ARCH_CONVERT, bno);
+		INT_SET(rp->ar_blockcount, ARCH_CONVERT, len);
+		xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
+	}
+	/*
+	 * If it's the by-size btree and it's the last leaf block and
+	 * it's the last record... then update the size of the longest
+	 * extent in the a.g., which we cache in the a.g. freelist header.
+	 */
+	if (cur->bc_btnum == XFS_BTNUM_CNT &&
+	    INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK &&
+	    ptr == INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		xfs_agf_t	*agf;	/* a.g. freespace header */
+		xfs_agnumber_t	seqno;
+
+		agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+		seqno = INT_GET(agf->agf_seqno, ARCH_CONVERT);
+		cur->bc_mp->m_perag[seqno].pagf_longest = len;
+		INT_SET(agf->agf_longest, ARCH_CONVERT, len);
+		xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
+			XFS_AGF_LONGEST);
+	}
+	/*
+	 * Updating first record in leaf. Pass new key value up to our parent.
+	 */
+	if (ptr == 1) {
+		xfs_alloc_key_t key;	/* key containing [bno, len] */
+
+		INT_SET(key.ar_startblock, ARCH_CONVERT, bno);
+		INT_SET(key.ar_blockcount, ARCH_CONVERT, len);
+		if ((error = xfs_alloc_updkey(cur, &key, 1)))
+			return error;
+	}
+	return 0;
+}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
new file mode 100644
index 000000000000..e19c3290dcd3
--- /dev/null
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ALLOC_BTREE_H__
+#define __XFS_ALLOC_BTREE_H__
+
+/*
+ * Freespace on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_btree_sblock;
+struct xfs_mount;
+
+/*
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno.  All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define XFS_ABTB_MAGIC	0x41425442	/* 'ABTB' for bno tree */
+#define XFS_ABTC_MAGIC	0x41425443	/* 'ABTC' for cnt tree */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec
+{
+	xfs_agblock_t	ar_startblock;	/* starting block number */
+	xfs_extlen_t	ar_blockcount;	/* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef xfs_agblock_t xfs_alloc_ptr_t;	/* btree pointer type */
+					/* btree block header type */
+typedef struct xfs_btree_sblock xfs_alloc_block_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_ALLOC_BLOCK)
+xfs_alloc_block_t *xfs_buf_to_alloc_block(struct xfs_buf *bp);
+#define XFS_BUF_TO_ALLOC_BLOCK(bp)	xfs_buf_to_alloc_block(bp)
+#else
+#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+/*
+ * Real block structures have a size equal to the disk block size.
+ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_SIZE)
+int xfs_alloc_block_size(int lev, struct xfs_btree_cur *cur);
+#define XFS_ALLOC_BLOCK_SIZE(lev,cur)	xfs_alloc_block_size(lev,cur)
+#else
+#define XFS_ALLOC_BLOCK_SIZE(lev,cur)	(1 << (cur)->bc_blocklog)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MAXRECS)
+int xfs_alloc_block_maxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur)	xfs_alloc_block_maxrecs(lev,cur)
+#else
+#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur)	\
+	((cur)->bc_mp->m_alloc_mxr[lev != 0])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_BLOCK_MINRECS)
+int xfs_alloc_block_minrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_ALLOC_BLOCK_MINRECS(lev,cur)	xfs_alloc_block_minrecs(lev,cur)
+#else
+#define XFS_ALLOC_BLOCK_MINRECS(lev,cur)	\
+	((cur)->bc_mp->m_alloc_mnr[lev != 0])
+#endif
+
+/*
+ * Minimum and maximum blocksize.
+ * The blocksize upper limit is pretty much arbitrary.
+ */
+#define XFS_MIN_BLOCKSIZE_LOG	9	/* i.e. 512 bytes */
+#define XFS_MAX_BLOCKSIZE_LOG	16	/* i.e. 65536 bytes */
+#define XFS_MIN_BLOCKSIZE	(1 << XFS_MIN_BLOCKSIZE_LOG)
+#define XFS_MAX_BLOCKSIZE	(1 << XFS_MAX_BLOCKSIZE_LOG)
+
+/*
+ * block numbers in the AG; SB is BB 0, AGF is BB 1, AGI is BB 2, AGFL is BB 3
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BNO_BLOCK)
+xfs_agblock_t xfs_bno_block(struct xfs_mount *mp);
+#define XFS_BNO_BLOCK(mp)	xfs_bno_block(mp)
+#else
+#define XFS_BNO_BLOCK(mp)	((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CNT_BLOCK)
+xfs_agblock_t xfs_cnt_block(struct xfs_mount *mp);
+#define XFS_CNT_BLOCK(mp)	xfs_cnt_block(mp)
+#else
+#define XFS_CNT_BLOCK(mp)	((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+#endif
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_REC_ADDR)
+xfs_alloc_rec_t *xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i,
+				    struct xfs_btree_cur *cur);
+#define XFS_ALLOC_REC_ADDR(bb,i,cur)	xfs_alloc_rec_addr(bb,i,cur)
+#else
+#define XFS_ALLOC_REC_ADDR(bb,i,cur)	\
+	XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, bb, i, \
+		XFS_ALLOC_BLOCK_MAXRECS(0, cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_KEY_ADDR)
+xfs_alloc_key_t *xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i,
+				    struct xfs_btree_cur *cur);
+#define XFS_ALLOC_KEY_ADDR(bb,i,cur)	xfs_alloc_key_addr(bb,i,cur)
+#else
+#define XFS_ALLOC_KEY_ADDR(bb,i,cur)	\
+	XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \
+		XFS_ALLOC_BLOCK_MAXRECS(1, cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ALLOC_PTR_ADDR)
+xfs_alloc_ptr_t *xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i,
+				    struct xfs_btree_cur *cur);
+#define XFS_ALLOC_PTR_ADDR(bb,i,cur)	xfs_alloc_ptr_addr(bb,i,cur)
+#else
+#define XFS_ALLOC_PTR_ADDR(bb,i,cur)	\
+	XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, bb, i, \
+		XFS_ALLOC_BLOCK_MAXRECS(1, cur))
+#endif
+
+/*
+ * Prototypes for externally visible routines.
+ */
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_alloc_decrement(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat); /* success/failure */
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int					/* error */
+xfs_alloc_delete(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat); /* success/failure */
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_alloc_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		*bno,	/* output: starting block of extent */
+	xfs_extlen_t		*len,	/* output: length of extent */
+	int			*stat); /* output: success/failure */
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_alloc_increment(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat); /* success/failure */
+
+/*
+ * Insert the current record at the point referenced by cur.
+ * The cursor may be inconsistent on return if splits have been done.
+ */
+int					/* error */
+xfs_alloc_insert(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_eq(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_ge(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_alloc_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len,	/* length of extent */
+	int			*stat); /* success/failure */
+
+/*
+ * Update the record referred to by cur, to the value given by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int					/* error */
+xfs_alloc_update(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* starting block of extent */
+	xfs_extlen_t		len);	/* length of extent */
+
+#endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
new file mode 100644
index 000000000000..57a28544be96
--- /dev/null
+++ b/fs/xfs/xfs_arch.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ARCH_H__
+#define __XFS_ARCH_H__
+
+#ifndef XFS_BIG_FILESYSTEMS
+#error XFS_BIG_FILESYSTEMS must be defined true or false
+#endif
+
+#ifdef __KERNEL__
+
+#include <asm/byteorder.h>
+
+#ifdef __LITTLE_ENDIAN
+# define __BYTE_ORDER	__LITTLE_ENDIAN
+#endif
+#ifdef __BIG_ENDIAN
+# define __BYTE_ORDER	__BIG_ENDIAN
+#endif
+
+#endif	/* __KERNEL__ */
+
+/* do we need conversion? */
+
+#define ARCH_NOCONVERT 1
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define ARCH_CONVERT   0
+#else
+#define ARCH_CONVERT   ARCH_NOCONVERT
+#endif
+
+/* generic swapping macros */
+
+#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var))))
+#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var))))
+#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var))))
+
+#define INT_SWAP(type, var) \
+    ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \
+    ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \
+    ((sizeof(type) == 2) ? INT_SWAP16(type,var) : \
+    (var))))
+
+#define INT_SWAP_UNALIGNED_32(from,to) \
+    { \
+	((__u8*)(to))[0] = ((__u8*)(from))[3]; \
+	((__u8*)(to))[1] = ((__u8*)(from))[2]; \
+	((__u8*)(to))[2] = ((__u8*)(from))[1]; \
+	((__u8*)(to))[3] = ((__u8*)(from))[0]; \
+    }
+
+#define INT_SWAP_UNALIGNED_64(from,to) \
+    { \
+	INT_SWAP_UNALIGNED_32( ((__u8*)(from)) + 4, ((__u8*)(to))); \
+	INT_SWAP_UNALIGNED_32( ((__u8*)(from)), ((__u8*)(to)) + 4); \
+    }
+
+/*
+ * get and set integers from potentially unaligned locations
+ */
+
+#define INT_GET_UNALIGNED_16_LE(pointer) \
+   ((__u16)((((__u8*)(pointer))[0]	) | (((__u8*)(pointer))[1] << 8 )))
+#define INT_GET_UNALIGNED_16_BE(pointer) \
+   ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1])))
+#define INT_SET_UNALIGNED_16_LE(pointer,value) \
+    { \
+	((__u8*)(pointer))[0] = (((value)     ) & 0xff); \
+	((__u8*)(pointer))[1] = (((value) >> 8) & 0xff); \
+    }
+#define INT_SET_UNALIGNED_16_BE(pointer,value) \
+    { \
+	((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \
+	((__u8*)(pointer))[1] = (((value)     ) & 0xff); \
+    }
+
+#define INT_GET_UNALIGNED_32_LE(pointer) \
+   ((__u32)((((__u8*)(pointer))[0]	) | (((__u8*)(pointer))[1] << 8 ) \
+	   |(((__u8*)(pointer))[2] << 16) | (((__u8*)(pointer))[3] << 24)))
+#define INT_GET_UNALIGNED_32_BE(pointer) \
+   ((__u32)((((__u8*)(pointer))[0] << 24) | (((__u8*)(pointer))[1] << 16) \
+	   |(((__u8*)(pointer))[2] << 8)  | (((__u8*)(pointer))[3]	)))
+
+#define INT_GET_UNALIGNED_64_LE(pointer) \
+   (((__u64)(INT_GET_UNALIGNED_32_LE(((__u8*)(pointer))+4)) << 32 ) \
+   |((__u64)(INT_GET_UNALIGNED_32_LE(((__u8*)(pointer))	 ))	  ))
+#define INT_GET_UNALIGNED_64_BE(pointer) \
+   (((__u64)(INT_GET_UNALIGNED_32_BE(((__u8*)(pointer))	 )) << 32  ) \
+   |((__u64)(INT_GET_UNALIGNED_32_BE(((__u8*)(pointer))+4))	   ))
+
+/*
+ * now pick the right ones for our MACHINE ARCHITECTURE
+ */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define INT_GET_UNALIGNED_16(pointer)	    INT_GET_UNALIGNED_16_LE(pointer)
+#define INT_SET_UNALIGNED_16(pointer,value) INT_SET_UNALIGNED_16_LE(pointer,value)
+#define INT_GET_UNALIGNED_32(pointer)	    INT_GET_UNALIGNED_32_LE(pointer)
+#define INT_GET_UNALIGNED_64(pointer)	    INT_GET_UNALIGNED_64_LE(pointer)
+#else
+#define INT_GET_UNALIGNED_16(pointer)	    INT_GET_UNALIGNED_16_BE(pointer)
+#define INT_SET_UNALIGNED_16(pointer,value) INT_SET_UNALIGNED_16_BE(pointer,value)
+#define INT_GET_UNALIGNED_32(pointer)	    INT_GET_UNALIGNED_32_BE(pointer)
+#define INT_GET_UNALIGNED_64(pointer)	    INT_GET_UNALIGNED_64_BE(pointer)
+#endif
+
+/* define generic INT_ macros */
+
+#define INT_GET(reference,arch) \
+    (((arch) == ARCH_NOCONVERT) \
+	? \
+	    (reference) \
+	: \
+	    INT_SWAP((reference),(reference)) \
+    )
+
+/* does not return a value */
+#define INT_SET(reference,arch,valueref) \
+    (__builtin_constant_p(valueref) ? \
+	(void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \
+	(void)( \
+	    ((reference) = (valueref)), \
+	    ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \
+	) \
+    )
+
+/* does not return a value */
+#define INT_MOD_EXPR(reference,arch,code) \
+    (void)(((arch) == ARCH_NOCONVERT) \
+	? \
+	    ((reference) code) \
+	: \
+	    ( \
+		(reference) = INT_GET((reference),arch) , \
+		((reference) code), \
+		INT_SET(reference, arch, reference) \
+	    ) \
+    )
+
+/* does not return a value */
+#define INT_MOD(reference,arch,delta) \
+    (void)( \
+	INT_MOD_EXPR(reference,arch,+=(delta)) \
+    )
+
+/*
+ * INT_COPY - copy a value between two locations with the
+ *	      _same architecture_ but _potentially different sizes_
+ *
+ *	    if the types of the two parameters are equal or they are
+ *		in native architecture, a simple copy is done
+ *
+ *	    otherwise, architecture conversions are done
+ *
+ */
+
+/* does not return a value */
+#define INT_COPY(dst,src,arch) \
+    (void)( \
+	((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
+	    ? \
+		((dst) = (src)) \
+	    : \
+		INT_SET(dst, arch, INT_GET(src, arch)) \
+    )
+
+/*
+ * INT_XLATE - copy a value in either direction between two locations
+ *	       with different architectures
+ *
+ *		    dir < 0	- copy from memory to buffer (native to arch)
+ *		    dir > 0	- copy from buffer to memory (arch to native)
+ */
+
+/* does not return a value */
+#define INT_XLATE(buf,mem,dir,arch) {\
+    ASSERT(dir); \
+    if (dir>0) { \
+	(mem)=INT_GET(buf, arch); \
+    } else { \
+	INT_SET(buf, arch, mem); \
+    } \
+}
+
+#define INT_ISZERO(reference,arch) \
+    ((reference) == 0)
+
+#define INT_ZERO(reference,arch) \
+    ((reference) = 0)
+
+#define INT_GET_UNALIGNED_16_ARCH(pointer,arch) \
+    ( ((arch) == ARCH_NOCONVERT) \
+	? \
+	    (INT_GET_UNALIGNED_16(pointer)) \
+	: \
+	    (INT_GET_UNALIGNED_16_BE(pointer)) \
+    )
+#define INT_SET_UNALIGNED_16_ARCH(pointer,value,arch) \
+    if ((arch) == ARCH_NOCONVERT) { \
+	INT_SET_UNALIGNED_16(pointer,value); \
+    } else { \
+	INT_SET_UNALIGNED_16_BE(pointer,value); \
+    }
+
+#define DIRINO4_GET_ARCH(pointer,arch) \
+    ( ((arch) == ARCH_NOCONVERT) \
+	? \
+	    (INT_GET_UNALIGNED_32(pointer)) \
+	: \
+	    (INT_GET_UNALIGNED_32_BE(pointer)) \
+    )
+
+#if XFS_BIG_FILESYSTEMS
+#define DIRINO_GET_ARCH(pointer,arch) \
+    ( ((arch) == ARCH_NOCONVERT) \
+	? \
+	    (INT_GET_UNALIGNED_64(pointer)) \
+	: \
+	    (INT_GET_UNALIGNED_64_BE(pointer)) \
+    )
+#else
+/* MACHINE ARCHITECTURE dependent */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define DIRINO_GET_ARCH(pointer,arch) \
+    DIRINO4_GET_ARCH((((__u8*)pointer)+4),arch)
+#else
+#define DIRINO_GET_ARCH(pointer,arch) \
+    DIRINO4_GET_ARCH(pointer,arch)
+#endif
+#endif
+
+#define DIRINO_COPY_ARCH(from,to,arch) \
+    if ((arch) == ARCH_NOCONVERT) { \
+	bcopy(from,to,sizeof(xfs_ino_t)); \
+    } else { \
+	INT_SWAP_UNALIGNED_64(from,to); \
+    }
+#define DIRINO4_COPY_ARCH(from,to,arch) \
+    if ((arch) == ARCH_NOCONVERT) { \
+	bcopy((((__u8*)from+4)),to,sizeof(xfs_dir2_ino4_t)); \
+    } else { \
+	INT_SWAP_UNALIGNED_32(from,to); \
+    }
+
+#endif	/* __XFS_ARCH_H__ */
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
new file mode 100644
index 000000000000..74563b62d3fc
--- /dev/null
+++ b/fs/xfs/xfs_attr.c
@@ -0,0 +1,2294 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * xfs_attr.c
+ *
+ * Provide the external interfaces to manage attribute lists.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when attribute list fits inside the inode.
+ */
+STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
+
+/*
+ * Internal routines when attribute list is one block.
+ */
+STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
+
+/*
+ * Internal routines when attribute list is more than one block.
+ */
+STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
+STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
+STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
+STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
+STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
+
+/*
+ * Routines to manipulate out-of-line attribute values.
+ */
+STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args);
+STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
+STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
+
+#define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
+#define ATTR_RMTVALUE_TRANSBLKS 8	/* max # of blks in a transaction */
+
+#if defined(DEBUG)
+ktrace_t *xfs_attr_trace_buf;
+#endif
+
+
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+/*ARGSUSED*/
+int								/* error */
+xfs_attr_get(bhv_desc_t *bdp, char *name, char *value, int *valuelenp,
+	     int flags, struct cred *cred)
+{
+	xfs_da_args_t	args;
+	int		error;
+	int		namelen;
+	xfs_inode_t	*ip = XFS_BHVTOI(bdp);
+
+	if (!name)
+		return EINVAL;
+	ASSERT(MAXNAMELEN-1 <= 0xff);	/* length is stored in uint8 */
+	namelen = strlen(name);
+	if (namelen >= MAXNAMELEN)
+		return EFAULT;		/* match IRIX behaviour */
+	XFS_STATS_INC(xfsstats.xs_attr_get);
+
+	if (XFS_IFORK_Q(ip) == 0)
+		return ENOATTR;
+
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return (EIO);
+
+	/*
+	 * Do we answer them, or ignore them?
+	 */
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	if ((error = xfs_iaccess(XFS_BHVTOI(bdp), IREAD, cred))) {
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(error));
+	}
+
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	bzero((char *)&args, sizeof(args));
+	args.name = name;
+	args.namelen = namelen;
+	args.value = value;
+	args.valuelen = *valuelenp;
+	args.flags = flags;
+	args.hashval = xfs_da_hashname(args.name, args.namelen);
+	args.dp = ip;
+	args.whichfork = XFS_ATTR_FORK;
+	args.trans = NULL;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (XFS_IFORK_Q(ip) == 0 ||
+	    (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     ip->i_d.di_anextents == 0)) {
+		error = XFS_ERROR(ENOATTR);
+	} else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_attr_shortform_getvalue(&args);
+	} else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_get(&args);
+	} else {
+		error = xfs_attr_node_get(&args);
+	}
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	/*
+	 * Return the number of bytes in the value to the caller.
+	 */
+	*valuelenp = args.valuelen;
+
+	if (error == EEXIST)
+		error = 0;
+	return(error);
+}
+
+/*ARGSUSED*/
+int								/* error */
+xfs_attr_set(bhv_desc_t *bdp, char *name, char *value, int valuelen, int flags,
+		     struct cred *cred)
+{
+	xfs_da_args_t	args;
+	xfs_inode_t	*dp;
+	xfs_fsblock_t	firstblock;
+	xfs_bmap_free_t flist;
+	int		error, err2, committed;
+	int		local, size;
+	uint		nblks;
+	xfs_mount_t	*mp;
+	int		rsvd = (flags & ATTR_ROOT) != 0;
+	int		namelen;
+
+	ASSERT(MAXNAMELEN-1 <= 0xff); /* length is stored in uint8 */
+	namelen = strlen(name);
+	if (namelen >= MAXNAMELEN)
+		return EFAULT; /* match irix behaviour */
+
+	XFS_STATS_INC(xfsstats.xs_attr_set);
+	/*
+	 * Do we answer them, or ignore them?
+	 */
+	dp = XFS_BHVTOI(bdp);
+	mp = dp->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return (EIO);
+
+	xfs_ilock(dp, XFS_ILOCK_SHARED);
+	if ((error = xfs_iaccess(dp, IWRITE, cred))) {
+		xfs_iunlock(dp, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(error));
+	}
+	xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+	/*
+	 * Attach the dquots to the inode.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if ((error = xfs_qm_dqattach(dp, 0)))
+			return (error);
+	}
+
+	/*
+	 * If the inode doesn't have an attribute fork, add one.
+	 * (inode must not be locked when we call this routine)
+	 */
+	if (XFS_IFORK_Q(dp) == 0) {
+		error = xfs_bmap_add_attrfork(dp, rsvd);
+		if (error)
+			return(error);
+	}
+
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	bzero((char *)&args, sizeof(args));
+	args.name = name;
+	args.namelen = namelen;
+	args.value = value;
+	args.valuelen = valuelen;
+	args.flags = flags;
+	args.hashval = xfs_da_hashname(args.name, args.namelen);
+	args.dp = dp;
+	args.firstblock = &firstblock;
+	args.flist = &flist;
+	args.whichfork = XFS_ATTR_FORK;
+	args.oknoent = 1;
+
+	/* Determine space new attribute will use, and if it will be inline
+	 * or out of line.
+	 */
+	size = xfs_attr_leaf_newentsize(&args, mp->m_sb.sb_blocksize, &local);
+
+	nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+	if (local) {
+		if (size > (mp->m_sb.sb_blocksize >> 1)) {
+			/* Double split possible */
+			nblks <<= 1;
+		}
+	} else {
+		uint	dblocks = XFS_B_TO_FSB(mp, valuelen);
+		/* Out of line attribute, cannot double split, but make
+		 * room for the attribute value itself.
+		 */
+		nblks += dblocks;
+		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+	}
+
+	/* Size is now blocks for attribute data */
+	args.total = nblks;
+
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET);
+
+	/*
+	 * Root fork attributes can use reserved data blocks for this
+	 * operation if necessary
+	 */
+
+	if (rsvd)
+		args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+	if ((error = xfs_trans_reserve(args.trans, (uint) nblks,
+				      XFS_ATTRSET_LOG_RES(mp, nblks),
+				      0, XFS_TRANS_PERM_LOG_RES,
+				      XFS_ATTRSET_LOG_COUNT))) {
+		xfs_trans_cancel(args.trans, 0);
+		return(error);
+	}
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (rsvd) {
+			error = xfs_trans_reserve_blkquota_force(args.trans,
+					dp, nblks);
+		} else {
+			error = xfs_trans_reserve_blkquota(args.trans,
+					dp, nblks);
+		}
+		if (error) {
+			xfs_iunlock(dp, XFS_ILOCK_EXCL);
+			xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
+			return (error);
+		}
+	}
+
+	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(args.trans, dp);
+
+	/*
+	 * If the attribute list is non-existant or a shortform list,
+	 * upgrade it to a single-leaf-block attribute list.
+	 */
+	if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
+	    ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) &&
+	     (dp->i_d.di_anextents == 0))) {
+
+		/*
+		 * Build initial attribute list (if required).
+		 */
+		if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+			(void)xfs_attr_shortform_create(&args);
+
+		/*
+		 * Try to add the attr to the attribute list in
+		 * the inode.
+		 */
+		error = xfs_attr_shortform_addname(&args);
+		if (error != ENOSPC) {
+			/*
+			 * Commit the shortform mods, and we're done.
+			 * NOTE: this is also the error path (EEXIST, etc).
+			 */
+			ASSERT(args.trans != NULL);
+
+			/*
+			 * If this is a synchronous mount, make sure that
+			 * the transaction goes to disk before returning
+			 * to the user.
+			 */
+			if (mp->m_flags & XFS_MOUNT_WSYNC) {
+				xfs_trans_set_sync(args.trans);
+			}
+			err2 = xfs_trans_commit(args.trans,
+						 XFS_TRANS_RELEASE_LOG_RES,
+						 NULL);
+			xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+			/*
+			 * Hit the inode change time.
+			 */
+			if (!error && (flags & ATTR_KERNOTIME) == 0) {
+				xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
+			}
+			return(error == 0 ? err2 : error);
+		}
+
+		/*
+		 * It won't fit in the shortform, transform to a leaf block.
+		 * GROT: another possible req'mt for a double-split btree op.
+		 */
+		XFS_BMAP_INIT(args.flist, args.firstblock);
+		error = xfs_attr_shortform_to_leaf(&args);
+		if (!error) {
+			error = xfs_bmap_finish(&args.trans, args.flist,
+						*args.firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args.trans = NULL;
+			xfs_bmap_cancel(&flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args.trans, dp);
+		}
+
+		/*
+		 * Commit the leaf transformation.  We'll need another (linked)
+		 * transaction to add the new attribute to the leaf.
+		 */
+		if ((error = xfs_attr_rolltrans(&args.trans, dp)))
+			goto out;
+
+	}
+
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_addname(&args);
+	} else {
+		error = xfs_attr_node_addname(&args);
+	}
+	if (error) {
+		goto out;
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(args.trans);
+	}
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
+				 NULL);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Hit the inode change time.
+	 */
+	if (!error && (flags & ATTR_KERNOTIME) == 0) {
+		xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
+	}
+
+	return(error);
+
+out:
+	if (args.trans)
+		xfs_trans_cancel(args.trans,
+			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return(error);
+}
+
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
+/*ARGSUSED*/
+int								/* error */
+xfs_attr_remove(bhv_desc_t *bdp, char *name, int flags, struct cred *cred)
+{
+	xfs_da_args_t	    args;
+	xfs_inode_t	    *dp;
+	xfs_fsblock_t	    firstblock;
+	xfs_bmap_free_t	    flist;
+	int		    error;
+	xfs_mount_t	    *mp;
+	int		    namelen;
+
+	ASSERT(MAXNAMELEN-1<=0xff); /* length is stored in uint8 */
+	namelen = strlen(name);
+	if (namelen>=MAXNAMELEN)
+		return EFAULT; /* match irix behaviour */
+
+	XFS_STATS_INC(xfsstats.xs_attr_remove);
+
+	/*
+	 * Do we answer them, or ignore them?
+	 */
+	dp = XFS_BHVTOI(bdp);
+	mp = dp->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return (EIO);
+
+	xfs_ilock(dp, XFS_ILOCK_SHARED);
+	if ((error = xfs_iaccess(dp, IWRITE, cred))) {
+		xfs_iunlock(dp, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(error));
+	} else if (XFS_IFORK_Q(dp) == 0 ||
+		   (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+		    dp->i_d.di_anextents == 0)) {
+		xfs_iunlock(dp, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(ENOATTR));
+	}
+	xfs_iunlock(dp, XFS_ILOCK_SHARED);
+
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	bzero((char *)&args, sizeof(args));
+	args.name = name;
+	args.namelen = namelen;
+	args.flags = flags;
+	args.hashval = xfs_da_hashname(args.name, args.namelen);
+	args.dp = dp;
+	args.firstblock = &firstblock;
+	args.flist = &flist;
+	args.total = 0;
+	args.whichfork = XFS_ATTR_FORK;
+
+	/*
+	 * Attach the dquots to the inode.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (XFS_NOT_DQATTACHED(mp, dp)) {
+			if ((error = xfs_qm_dqattach(dp, 0)))
+				return (error);
+		}
+	}
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM);
+
+	/*
+	 * Root fork attributes can use reserved data blocks for this
+	 * operation if necessary
+	 */
+
+	if (flags & ATTR_ROOT)
+		args.trans->t_flags |= XFS_TRANS_RESERVE;
+
+	if ((error = xfs_trans_reserve(args.trans,
+				      XFS_ATTRRM_SPACE_RES(mp),
+				      XFS_ATTRRM_LOG_RES(mp),
+				      0, XFS_TRANS_PERM_LOG_RES,
+				      XFS_ATTRRM_LOG_COUNT))) {
+		xfs_trans_cancel(args.trans, 0);
+		return(error);
+
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	/*
+	 * No need to make quota reservations here. We expect to release some
+	 * blocks not allocate in the common case.
+	 */
+	xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(args.trans, dp);
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (XFS_IFORK_Q(dp) == 0 ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+		error = XFS_ERROR(ENOATTR);
+		goto out;
+	}
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+		error = xfs_attr_shortform_remove(&args);
+		if (error) {
+			goto out;
+		}
+	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_removename(&args);
+	} else {
+		error = xfs_attr_node_removename(&args);
+	}
+	if (error) {
+		goto out;
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(args.trans);
+	}
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES,
+				 NULL);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Hit the inode change time.
+	 */
+	if (!error && (flags & ATTR_KERNOTIME) == 0) {
+		xfs_ichgtime(dp, XFS_ICHGTIME_CHG);
+	}
+
+	return(error);
+
+out:
+	if (args.trans)
+		xfs_trans_cancel(args.trans,
+			XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return(error);
+}
+
+/*
+ * Generate a list of extended attribute names and optionally
+ * also value lengths.	Positive return value follows the XFS
+ * convention of being an error, zero or negative return code
+ * is the length of the buffer returned (negated), indicating
+ * success.
+ */
+int
+xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
+		      attrlist_cursor_kern_t *cursor, struct cred *cred)
+{
+	xfs_attr_list_context_t context;
+	xfs_inode_t *dp;
+	int error;
+
+	XFS_STATS_INC(xfsstats.xs_attr_list);
+
+	/*
+	 * Validate the cursor.
+	 */
+	if (cursor->pad1 || cursor->pad2)
+		return(XFS_ERROR(EINVAL));
+	if ((cursor->initted == 0) &&
+	    (cursor->hashval || cursor->blkno || cursor->offset))
+		return(XFS_ERROR(EINVAL));
+
+	/*
+	 * Check for a properly aligned buffer.
+	 */
+	if (((long)buffer) & (sizeof(int)-1))
+		return(XFS_ERROR(EFAULT));
+	if (flags & ATTR_KERNOVAL)
+		bufsize = 0;
+
+	/*
+	 * Initialize the output buffer.
+	 */
+	context.dp = dp = XFS_BHVTOI(bdp);
+	context.cursor = cursor;
+	context.count = 0;
+	context.dupcnt = 0;
+	context.resynch = 1;
+	context.flags = flags;
+	if (!(flags & ATTR_KERNAMELS)) {
+		context.bufsize = (bufsize & ~(sizeof(int)-1));	 /* align */
+		context.firstu = context.bufsize;
+		context.alist = (attrlist_t *)buffer;
+		context.alist->al_count = 0;
+		context.alist->al_more = 0;
+		context.alist->al_offset[0] = context.bufsize;
+	}
+	else {
+		context.bufsize = bufsize;
+		context.firstu = context.bufsize;
+		context.alist = (attrlist_t *)buffer;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return (EIO);
+	/*
+	 * Do they have permission?
+	 */
+	xfs_ilock(dp, XFS_ILOCK_SHARED);
+	if ((error = xfs_iaccess(dp, IREAD, cred))) {
+		xfs_iunlock(dp, XFS_ILOCK_SHARED);
+		return(XFS_ERROR(error));
+	}
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	xfs_attr_trace_l_c("syscall start", &context);
+	if (XFS_IFORK_Q(dp) == 0 ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+		error = 0;
+	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_attr_shortform_list(&context);
+	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_list(&context);
+	} else {
+		error = xfs_attr_node_list(&context);
+	}
+	xfs_iunlock(dp, XFS_ILOCK_SHARED);
+	xfs_attr_trace_l_c("syscall end", &context);
+
+	if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) {
+		ASSERT(error >= 0);
+	}
+	else {	/* must return negated buffer size or the error */
+		if (context.count < 0)
+			error = XFS_ERROR(ERANGE);
+		else
+			error = -context.count;
+	}
+
+	return(error);
+}
+
+int								/* error */
+xfs_attr_inactive(xfs_inode_t *dp)
+{
+	xfs_trans_t *trans;
+	xfs_mount_t *mp;
+	int error;
+
+	mp = dp->i_mount;
+	ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+	/* XXXsup - why on earth are we taking ILOCK_EXCL here??? */
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+	if ((XFS_IFORK_Q(dp) == 0) ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+		return(0);
+	}
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Start our first transaction of the day.
+	 *
+	 * All future transactions during this code must be "chained" off
+	 * this one via the trans_dup() call.  All transactions will contain
+	 * the inode, and the inode will always be marked with trans_ihold().
+	 * Since the inode will be locked in all transactions, we must log
+	 * the inode in every transaction to let it float upward through
+	 * the log.
+	 */
+	trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
+	if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
+				      XFS_TRANS_PERM_LOG_RES,
+				      XFS_ATTRINVAL_LOG_COUNT))) {
+		xfs_trans_cancel(trans, 0);
+		return(error);
+	}
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * No need to make quota reservations here. We expect to release some
+	 * blocks, not allocate, in the common case.
+	 */
+	xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(trans, dp);
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if ((XFS_IFORK_Q(dp) == 0) ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+		error = 0;
+		goto out;
+	}
+	error = xfs_attr_root_inactive(&trans, dp);
+	if (error)
+		goto out;
+	/*
+	 * signal synchronous inactive transactions unless this
+	 * is a synchronous mount filesystem in which case we
+	 * know that we're here because we've been called out of
+	 * xfs_inactive which means that the last reference is gone
+	 * and the unlink transaction has already hit the disk so
+	 * async inactive transactions are safe.
+	 */
+	if ((error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK,
+				(!(mp->m_flags & XFS_MOUNT_WSYNC)
+				 ? 1 : 0))))
+		goto out;
+
+	/*
+	 * Commit the last in the sequence of transactions.
+	 */
+	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+	error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES,
+				 NULL);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+	return(error);
+
+out:
+	xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	return(error);
+}
+
+
+
+/*========================================================================
+ * External routines when attribute list is inside the inode
+ *========================================================================*/
+
+/*
+ * Add a name to the shortform attribute list structure
+ * This is the external routine.
+ */
+STATIC int
+xfs_attr_shortform_addname(xfs_da_args_t *args)
+{
+	int newsize, retval;
+
+	retval = xfs_attr_shortform_lookup(args);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		return(retval);
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE)
+			return(retval);
+		retval = xfs_attr_shortform_remove(args);
+		ASSERT(retval == 0);
+	}
+
+	newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
+	newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+	if ((newsize <= XFS_IFORK_ASIZE(args->dp)) &&
+	    (args->namelen < XFS_ATTR_SF_ENTSIZE_MAX) &&
+	    (args->valuelen < XFS_ATTR_SF_ENTSIZE_MAX)) {
+		retval = xfs_attr_shortform_add(args);
+		ASSERT(retval == 0);
+	} else {
+		return(XFS_ERROR(ENOSPC));
+	}
+	return(0);
+}
+
+
+/*========================================================================
+ * External routines when attribute list is one block
+ *========================================================================*/
+
+/*
+ * Add a name to the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+int
+xfs_attr_leaf_addname(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int retval, error, committed;
+
+	/*
+	 * Read the (only) block in the attribute list in.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+
+	/*
+	 * Look up the given attribute in the leaf block.  Figure out if
+	 * the given flags produce an error or call for an atomic rename.
+	 */
+	retval = xfs_attr_leaf_lookup_int(bp, args);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		xfs_da_brelse(args->trans, bp);
+		return(retval);
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE) {	/* pure create op */
+			xfs_da_brelse(args->trans, bp);
+			return(retval);
+		}
+		args->rename = 1;			/* an atomic rename */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+	}
+
+	/*
+	 * Add the attribute to the leaf block, transitioning to a Btree
+	 * if required.
+	 */
+	retval = xfs_attr_leaf_add(bp, args);
+	xfs_da_buf_done(bp);
+	if (retval == ENOSPC) {
+		/*
+		 * Promote the attribute list to the Btree format, then
+		 * Commit that transaction so that the node_addname() call
+		 * can manage its own transactions.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_attr_leaf_to_node(args);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+
+		/*
+		 * Commit the current trans (including the inode) and start
+		 * a new one.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			return (error);
+
+		/*
+		 * Fob the whole rest of the problem off on the Btree code.
+		 */
+		error = xfs_attr_node_addname(args);
+		return(error);
+	}
+
+	/*
+	 * Commit the transaction that added the attr name so that
+	 * later routines can manage their own transactions.
+	 */
+	if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		return (error);
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return(error);
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
+	if (args->rename) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr_leaf_flipflags(args);
+		if (error)
+			return(error);
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return(error);
+		}
+
+		/*
+		 * Read in the block containing the "old" attr, then
+		 * remove the "old" attr from that block (neat, huh!)
+		 */
+		error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
+						     &bp, XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+		(void)xfs_attr_leaf_remove(bp, args);
+
+		/*
+		 * If the result is small enough, shrink it all into the inode.
+		 */
+		if (xfs_attr_shortform_allfit(bp, dp)) {
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_attr_leaf_to_shortform(bp, args);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				return(error);
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+		} else
+			xfs_da_buf_done(bp);
+
+		/*
+		 * Commit the remove and start the next trans in series.
+		 */
+		error = xfs_attr_rolltrans(&args->trans, dp);
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr_leaf_clearflag(args);
+	}
+	return(error);
+}
+
+/*
+ * Remove a name from the leaf attribute list structure
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+STATIC int
+xfs_attr_leaf_removename(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int committed;
+	int error;
+
+	/*
+	 * Remove the attribute.
+	 */
+	dp = args->dp;
+	args->blkno = 0;
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+
+	ASSERT(bp != NULL);
+	error = xfs_attr_leaf_lookup_int(bp, args);
+	if (error == ENOATTR) {
+		xfs_da_brelse(args->trans, bp);
+		return(error);
+	}
+
+	(void)xfs_attr_leaf_remove(bp, args);
+
+	/*
+	 * If the result is small enough, shrink it all into the inode.
+	 */
+	if (xfs_attr_shortform_allfit(bp, dp)) {
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_attr_leaf_to_shortform(bp, args);
+		/* bp is gone due to xfs_da_shrink_inode */
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+	} else
+		xfs_da_buf_done(bp);
+	return(0);
+}
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ *
+ * This leaf block cannot have a "remote" value, we only call this routine
+ * if bmap_one_block() says there is only one block (ie: no remote blks).
+ */
+int
+xfs_attr_leaf_get(xfs_da_args_t *args)
+{
+	xfs_dabuf_t *bp;
+	int error;
+
+	args->blkno = 0;
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+
+	error = xfs_attr_leaf_lookup_int(bp, args);
+	if (error != EEXIST)  {
+		xfs_da_brelse(args->trans, bp);
+		return(error);
+	}
+	error = xfs_attr_leaf_getvalue(bp, args);
+	xfs_da_brelse(args->trans, bp);
+	if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) {
+		error = xfs_attr_rmtval_get(args);
+	}
+	return(error);
+}
+
+/*
+ * Copy out attribute entries for attr_list(), for leaf attribute lists.
+ */
+STATIC int
+xfs_attr_leaf_list(xfs_attr_list_context_t *context)
+{
+	xfs_attr_leafblock_t *leaf;
+	int error;
+	xfs_dabuf_t *bp;
+
+	context->cursor->blkno = 0;
+	error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						!= XFS_ATTR_LEAF_MAGIC) {
+		xfs_da_brelse(NULL, bp);
+		return(XFS_ERROR(EFSCORRUPTED));
+	}
+
+	(void)xfs_attr_leaf_list_int(bp, context);
+	xfs_da_brelse(NULL, bp);
+	return(0);
+}
+
+
+/*========================================================================
+ * External routines when attribute list size > XFS_LBSIZE(mp).
+ *========================================================================*/
+
+/*
+ * Add a name to a Btree-format attribute list.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ *
+ * "Remote" attribute values confuse the issue and atomic rename operations
+ * add a whole extra layer of confusion on top of that.
+ */
+STATIC int
+xfs_attr_node_addname(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	xfs_mount_t *mp;
+	int committed, retval, error;
+
+	/*
+	 * Fill in bucket of arguments/results/context to carry around.
+	 */
+	dp = args->dp;
+	mp = dp->i_mount;
+restart:
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = mp;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name already exists, and get back a pointer
+	 * to where it should go.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error)
+		goto out;
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) {
+		goto out;
+	} else if (retval == EEXIST) {
+		if (args->flags & ATTR_CREATE)
+			goto out;
+		args->rename = 1;			/* atomic rename op */
+		args->blkno2 = args->blkno;		/* set 2nd entry info*/
+		args->index2 = args->index;
+		args->rmtblkno2 = args->rmtblkno;
+		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+	}
+
+	retval = xfs_attr_leaf_add(blk->bp, state->args);
+	if (retval == ENOSPC) {
+		if (state->path.active == 1) {
+			/*
+			 * Its really a single leaf node, but it had
+			 * out-of-line values so it looked like it *might*
+			 * have been a b-tree.
+			 */
+			xfs_da_state_free(state);
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_attr_leaf_to_node(args);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+
+			/*
+			 * Commit the node conversion and start the next
+			 * trans in the chain.
+			 */
+			if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+				goto out;
+
+			goto restart;
+		}
+
+		/*
+		 * Split as many Btree elements as required.
+		 * This code tracks the new and old attr's location
+		 * in the index/blkno/rmtblkno/rmtblkcnt fields and
+		 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_da_split(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+	} else {
+		/*
+		 * Addition succeeded, update Btree hashvals.
+		 */
+		xfs_da_fixhashpath(state, &state->path);
+	}
+
+	/*
+	 * Kill the state structure, we're done with it and need to
+	 * allow the buffers to come back later.
+	 */
+	xfs_da_state_free(state);
+	state = NULL;
+
+	/*
+	 * Commit the leaf addition or btree split and start the next
+	 * trans in the chain.
+	 */
+	if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+		goto out;
+
+	/*
+	 * If there was an out-of-line value, allocate the blocks we
+	 * identified for its storage and copy the value.  This is done
+	 * after we create the attribute so that we don't overflow the
+	 * maximum size of a transaction and/or hit a deadlock.
+	 */
+	if (args->rmtblkno > 0) {
+		error = xfs_attr_rmtval_set(args);
+		if (error)
+			return(error);
+	}
+
+	/*
+	 * If this is an atomic rename operation, we must "flip" the
+	 * incomplete flags on the "new" and "old" attribute/value pairs
+	 * so that one disappears and one appears atomically.  Then we
+	 * must remove the "old" attribute/value pair.
+	 */
+	if (args->rename) {
+		/*
+		 * In a separate transaction, set the incomplete flag on the
+		 * "old" attr and clear the incomplete flag on the "new" attr.
+		 */
+		error = xfs_attr_leaf_flipflags(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Dismantle the "old" attribute/value pair by removing
+		 * a "remote" value (if it exists).
+		 */
+		args->index = args->index2;
+		args->blkno = args->blkno2;
+		args->rmtblkno = args->rmtblkno2;
+		args->rmtblkcnt = args->rmtblkcnt2;
+		if (args->rmtblkno) {
+			error = xfs_attr_rmtval_remove(args);
+			if (error)
+				return(error);
+		}
+
+		/*
+		 * Re-find the "old" attribute entry after any split ops.
+		 * The INCOMPLETE flag means that we will find the "old"
+		 * attr, not the "new" one.
+		 */
+		args->flags |= XFS_ATTR_INCOMPLETE;
+		state = xfs_da_state_alloc();
+		state->args = args;
+		state->mp = mp;
+		state->blocksize = state->mp->m_sb.sb_blocksize;
+		state->inleaf = 0;
+		error = xfs_da_node_lookup_int(state, &retval);
+		if (error)
+			goto out;
+
+		/*
+		 * Remove the name and update the hashvals in the tree.
+		 */
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+		error = xfs_attr_leaf_remove(blk->bp, args);
+		xfs_da_fixhashpath(state, &state->path);
+
+		/*
+		 * Check to see if the tree needs to be collapsed.
+		 */
+		if (retval && (state->path.active > 1)) {
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_da_join(state);
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+		}
+
+		/*
+		 * Commit and start the next trans in the chain.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			goto out;
+
+	} else if (args->rmtblkno > 0) {
+		/*
+		 * Added a "remote" value, just clear the incomplete flag.
+		 */
+		error = xfs_attr_leaf_clearflag(args);
+		if (error)
+			goto out;
+	}
+	retval = error = 0;
+
+out:
+	if (state)
+		xfs_da_state_free(state);
+	if (error)
+		return(error);
+	return(retval);
+}
+
+/*
+ * Remove a name from a B-tree attribute list.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_attr_node_removename(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int retval, error, committed;
+
+	/*
+	 * Tie a string around our finger to remind us where we are.
+	 */
+	dp = args->dp;
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error || (retval != EEXIST)) {
+		if (error == 0)
+			error = retval;
+		goto out;
+	}
+
+	/*
+	 * If there is an out-of-line value, de-allocate the blocks.
+	 * This is done before we remove the attribute so that we don't
+	 * overflow the maximum size of a transaction and/or hit a deadlock.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->bp != NULL);
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	if (args->rmtblkno > 0) {
+		/*
+		 * Fill in disk block numbers in the state structure
+		 * so that we can get the buffers back after we commit
+		 * several transactions in the following calls.
+		 */
+		error = xfs_attr_fillstate(state);
+		if (error)
+			goto out;
+
+		/*
+		 * Mark the attribute as INCOMPLETE, then bunmapi() the
+		 * remote value.
+		 */
+		error = xfs_attr_leaf_setflag(args);
+		if (error)
+			goto out;
+		error = xfs_attr_rmtval_remove(args);
+		if (error)
+			goto out;
+
+		/*
+		 * Refill the state structure with buffers, the prior calls
+		 * released our buffers.
+		 */
+		error = xfs_attr_refillstate(state);
+		if (error)
+			goto out;
+	}
+
+	/*
+	 * Remove the name and update the hashvals in the tree.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+	retval = xfs_attr_leaf_remove(blk->bp, args);
+	xfs_da_fixhashpath(state, &state->path);
+
+	/*
+	 * Check to see if the tree needs to be collapsed.
+	 */
+	if (retval && (state->path.active > 1)) {
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_da_join(state);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			goto out;
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+
+		/*
+		 * Commit the Btree join operation and start a new trans.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			goto out;
+	}
+
+	/*
+	 * If the result is small enough, push it all into the inode.
+	 */
+	if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		/*
+		 * Have to get rid of the copy of this dabuf in the state.
+		 */
+		ASSERT(state->path.active == 1);
+		ASSERT(state->path.blk[0].bp);
+		xfs_da_buf_done(state->path.blk[0].bp);
+		state->path.blk[0].bp = NULL;
+
+		error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+						     XFS_ATTR_FORK);
+		if (error)
+			goto out;
+		ASSERT(INT_GET(((xfs_attr_leafblock_t *)
+				      bp->data)->hdr.info.magic, ARCH_CONVERT)
+						       == XFS_ATTR_LEAF_MAGIC);
+
+		if (xfs_attr_shortform_allfit(bp, dp)) {
+			XFS_BMAP_INIT(args->flist, args->firstblock);
+			error = xfs_attr_leaf_to_shortform(bp, args);
+			/* bp is gone due to xfs_da_shrink_inode */
+			if (!error) {
+				error = xfs_bmap_finish(&args->trans,
+							args->flist,
+							*args->firstblock,
+							&committed);
+			}
+			if (error) {
+				ASSERT(committed);
+				args->trans = NULL;
+				xfs_bmap_cancel(args->flist);
+				goto out;
+			}
+
+			/*
+			 * bmap_finish() may have committed the last trans
+			 * and started a new one.  We need the inode to be
+			 * in all transactions.
+			 */
+			if (committed) {
+				xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+				xfs_trans_ihold(args->trans, dp);
+			}
+		} else
+			xfs_da_brelse(args->trans, bp);
+	}
+	error = 0;
+
+out:
+	xfs_da_state_free(state);
+	return(error);
+}
+
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commit's has released these buffers.
+ */
+STATIC int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level;
+
+	/*
+	 * Roll down the "path" in the state structure, storing the on-disk
+	 * block number for those buffers in the "path".
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = xfs_da_blkno(blk->bp);
+			xfs_da_buf_done(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, storing the on-disk
+	 * block number for those buffers in the "altpath".
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->bp) {
+			blk->disk_blkno = xfs_da_blkno(blk->bp);
+			xfs_da_buf_done(blk->bp);
+			blk->bp = NULL;
+		} else {
+			blk->disk_blkno = 0;
+		}
+	}
+
+	return(0);
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commit's has released those
+ * buffers from our grip.
+ */
+STATIC int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+	xfs_da_state_path_t *path;
+	xfs_da_state_blk_t *blk;
+	int level, error;
+
+	/*
+	 * Roll down the "path" in the state structure, storing the on-disk
+	 * block number for those buffers in the "path".
+	 */
+	path = &state->path;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da_read_buf(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	/*
+	 * Roll down the "altpath" in the state structure, storing the on-disk
+	 * block number for those buffers in the "altpath".
+	 */
+	path = &state->altpath;
+	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+		if (blk->disk_blkno) {
+			error = xfs_da_read_buf(state->args->trans,
+						state->args->dp,
+						blk->blkno, blk->disk_blkno,
+						&blk->bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+		} else {
+			blk->bp = NULL;
+		}
+	}
+
+	return(0);
+}
+
+/*
+ * Look up a filename in a node attribute list.
+ *
+ * This routine gets called for any attribute fork that has more than one
+ * block, ie: both true Btree attr lists and for single-leaf-blocks with
+ * "remote" values taking up more blocks.
+ */
+int
+xfs_attr_node_get(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int error, retval;
+	int i;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	} else if (retval == EEXIST) {
+		blk = &state->path.blk[ state->path.active-1 ];
+		ASSERT(blk->bp != NULL);
+		ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+
+		/*
+		 * Get the value, local or "remote"
+		 */
+		retval = xfs_attr_leaf_getvalue(blk->bp, args);
+		if (!retval && (args->rmtblkno > 0)
+		    && !(args->flags & ATTR_KERNOVAL)) {
+			retval = xfs_attr_rmtval_get(args);
+		}
+	}
+
+	/*
+	 * If not in a transaction, we have to release all the buffers.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return(retval);
+}
+
+STATIC int							/* error */
+xfs_attr_node_list(xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	int error, i;
+	xfs_dabuf_t *bp;
+
+	cursor = context->cursor;
+	cursor->initted = 1;
+
+	/*
+	 * Do all sorts of validation on the passed-in cursor structure.
+	 * If anything is amiss, ignore the cursor and look up the hashval
+	 * starting from the btree root.
+	 */
+	bp = NULL;
+	if (cursor->blkno > 0) {
+		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+					      &bp, XFS_ATTR_FORK);
+		if ((error != 0) && (error != EFSCORRUPTED))
+			return(error);
+		if (bp) {
+			node = bp->data;
+			switch (INT_GET(node->hdr.info.magic, ARCH_CONVERT)) {
+			case XFS_DA_NODE_MAGIC:
+				xfs_attr_trace_l_cn("wrong blk", context, node);
+				xfs_da_brelse(NULL, bp);
+				bp = NULL;
+				break;
+			case XFS_ATTR_LEAF_MAGIC:
+				leaf = bp->data;
+				if (cursor->hashval >
+				    INT_GET(leaf->entries[
+					 INT_GET(leaf->hdr.count,
+						ARCH_CONVERT)-1].hashval,
+							ARCH_CONVERT)) {
+					xfs_attr_trace_l_cl("wrong blk",
+							   context, leaf);
+					xfs_da_brelse(NULL, bp);
+					bp = NULL;
+				} else if (cursor->hashval <=
+					     INT_GET(leaf->entries[0].hashval,
+							ARCH_CONVERT)) {
+					xfs_attr_trace_l_cl("maybe wrong blk",
+							   context, leaf);
+					xfs_da_brelse(NULL, bp);
+					bp = NULL;
+				}
+				break;
+			default:
+				xfs_attr_trace_l_c("wrong blk - ??", context);
+				xfs_da_brelse(NULL, bp);
+				bp = NULL;
+			}
+		}
+	}
+
+	/*
+	 * We did not find what we expected given the cursor's contents,
+	 * so we start from the top and work down based on the hash value.
+	 * Note that start of node block is same as start of leaf block.
+	 */
+	if (bp == NULL) {
+		cursor->blkno = 0;
+		for (;;) {
+			error = xfs_da_read_buf(NULL, context->dp,
+						      cursor->blkno, -1, &bp,
+						      XFS_ATTR_FORK);
+			if (error)
+				return(error);
+			if (bp == NULL)
+				return(XFS_ERROR(EFSCORRUPTED));
+			node = bp->data;
+			if (INT_GET(node->hdr.info.magic, ARCH_CONVERT)
+							== XFS_ATTR_LEAF_MAGIC)
+				break;
+			if (INT_GET(node->hdr.info.magic, ARCH_CONVERT)
+							!= XFS_DA_NODE_MAGIC) {
+				xfs_da_brelse(NULL, bp);
+				return(XFS_ERROR(EFSCORRUPTED));
+			}
+			btree = node->btree;
+			for (i = 0;
+				i < INT_GET(node->hdr.count, ARCH_CONVERT);
+								btree++, i++) {
+				if (cursor->hashval
+						<= INT_GET(btree->hashval,
+							    ARCH_CONVERT)) {
+					cursor->blkno = INT_GET(btree->before, ARCH_CONVERT);
+					xfs_attr_trace_l_cb("descending",
+							    context, btree);
+					break;
+				}
+			}
+			if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
+				xfs_da_brelse(NULL, bp);
+				return(0);
+			}
+			xfs_da_brelse(NULL, bp);
+		}
+	}
+	ASSERT(bp != NULL);
+
+	/*
+	 * Roll upward through the blocks, processing each leaf block in
+	 * order.  As long as there is space in the result buffer, keep
+	 * adding the information.
+	 */
+	for (;;) {
+		leaf = bp->data;
+		if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						!= XFS_ATTR_LEAF_MAGIC) {
+			xfs_da_brelse(NULL, bp);
+			return(XFS_ERROR(EFSCORRUPTED));
+		}
+		error = xfs_attr_leaf_list_int(bp, context);
+		if (error || (INT_ISZERO(leaf->hdr.info.forw, ARCH_CONVERT)))
+			break;	/* not really an error, buffer full or EOF */
+		cursor->blkno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT);
+		xfs_da_brelse(NULL, bp);
+		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+					      &bp, XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		if (bp == NULL)
+			return(XFS_ERROR(EFSCORRUPTED));
+	}
+	xfs_da_brelse(NULL, bp);
+	return(0);
+}
+
+
+/*========================================================================
+ * External routines for manipulating out-of-line attribute values.
+ *========================================================================*/
+
+/*
+ * Read the value associated with an attribute from the out-of-line buffer
+ * that we stored it in.
+ */
+STATIC int
+xfs_attr_rmtval_get(xfs_da_args_t *args)
+{
+	xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
+	xfs_mount_t *mp;
+	xfs_daddr_t dblkno;
+	xfs_caddr_t dst;
+	xfs_buf_t *bp;
+	int nmap, error, tmp, valuelen, blkcnt, i;
+	xfs_dablk_t lblkno;
+
+	ASSERT(!(args->flags & ATTR_KERNOVAL));
+
+	mp = args->dp->i_mount;
+	dst = args->value;
+	valuelen = args->valuelen;
+	lblkno = args->rmtblkno;
+	while (valuelen > 0) {
+		nmap = ATTR_RMTVALUE_MAPSIZE;
+		error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno,
+				  args->rmtblkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				  NULL, 0, map, &nmap, NULL);
+		if (error)
+			return(error);
+		ASSERT(nmap >= 1);
+
+		for (i = 0; (i < nmap) && (valuelen > 0); i++) {
+			ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) &&
+			       (map[i].br_startblock != HOLESTARTBLOCK));
+			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
+			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+			error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
+					     blkcnt, XFS_BUF_LOCK, &bp);
+			if (error)
+				return(error);
+
+			tmp = (valuelen < XFS_BUF_SIZE(bp))
+				? valuelen : XFS_BUF_SIZE(bp);
+			xfs_biomove(bp, 0, tmp, dst, XFS_B_READ);
+			xfs_buf_relse(bp);
+			dst += tmp;
+			valuelen -= tmp;
+
+			lblkno += map[i].br_blockcount;
+		}
+	}
+	ASSERT(valuelen == 0);
+	return(0);
+}
+
+/*
+ * Write the value associated with an attribute into the out-of-line buffer
+ * that we have defined for it.
+ */
+STATIC int
+xfs_attr_rmtval_set(xfs_da_args_t *args)
+{
+	xfs_mount_t *mp;
+	xfs_fileoff_t lfileoff;
+	xfs_inode_t *dp;
+	xfs_bmbt_irec_t map;
+	xfs_daddr_t dblkno;
+	xfs_caddr_t src;
+	xfs_buf_t *bp;
+	xfs_dablk_t lblkno;
+	int blkcnt, valuelen, nmap, error, tmp, committed;
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	src = args->value;
+
+	/*
+	 * Find a "hole" in the attribute address space large enough for
+	 * us to drop the new attribute's value into.
+	 */
+	blkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+	lfileoff = 0;
+	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
+						   XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff;
+	args->rmtblkcnt = blkcnt;
+
+	/*
+	 * Roll through the "value", allocating blocks on disk as required.
+	 */
+	while (blkcnt > 0) {
+		/*
+		 * Allocate a single extent, up to the size of the value.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
+				  blkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA |
+							XFS_BMAPI_WRITE,
+				  args->firstblock, args->total, &map, &nmap,
+				  args->flist);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, dp);
+		}
+
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+		lblkno += map.br_blockcount;
+		blkcnt -= map.br_blockcount;
+
+		/*
+		 * Start the next trans in the chain.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+			return (error);
+	}
+
+	/*
+	 * Roll through the "value", copying the attribute value to the
+	 * already-allocated blocks.  Blocks are written synchronously
+	 * so that we can know they are all on disk before we turn off
+	 * the INCOMPLETE flag.
+	 */
+	lblkno = args->rmtblkno;
+	valuelen = args->valuelen;
+	while (valuelen > 0) {
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
+				  args->rmtblkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				  args->firstblock, 0, &map, &nmap, NULL);
+		if (error) {
+			return(error);
+		}
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
+							blkcnt, XFS_BUF_LOCK);
+		ASSERT(bp);
+		ASSERT(!XFS_BUF_GETERROR(bp));
+
+		tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen :
+							XFS_BUF_SIZE(bp);
+		xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE);
+		if (tmp < XFS_BUF_SIZE(bp))
+			xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp);
+		if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */
+			return (error);
+		}
+		src += tmp;
+		valuelen -= tmp;
+
+		lblkno += map.br_blockcount;
+	}
+	ASSERT(valuelen == 0);
+	return(0);
+}
+
+/*
+ * Remove the value associated with an attribute by deleting the
+ * out-of-line buffer that it is stored on.
+ */
+STATIC int
+xfs_attr_rmtval_remove(xfs_da_args_t *args)
+{
+	xfs_mount_t *mp;
+	xfs_bmbt_irec_t map;
+	xfs_buf_t *bp;
+	xfs_daddr_t dblkno;
+	xfs_dablk_t lblkno;
+	int valuelen, blkcnt, nmap, error, done, committed;
+
+	mp = args->dp->i_mount;
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's
+	 * blocks.
+	 */
+	lblkno = args->rmtblkno;
+	valuelen = args->rmtblkcnt;
+	while (valuelen > 0) {
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		nmap = 1;
+		error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
+					args->rmtblkcnt,
+					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+					args->firstblock, 0, &map, &nmap,
+					args->flist);
+		if (error) {
+			return(error);
+		}
+		ASSERT(nmap == 1);
+		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+		       (map.br_startblock != HOLESTARTBLOCK));
+
+		dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
+		blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
+
+		/*
+		 * If the "remote" value is in the cache, remove it.
+		 */
+		/* bp = incore(mp->m_dev, dblkno, blkcnt, 1); */
+		bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, 1);
+		if (bp) {
+			XFS_BUF_STALE(bp);
+			XFS_BUF_UNDELAYWRITE(bp);
+			xfs_buf_relse(bp);
+			bp = NULL;
+		}
+
+		valuelen -= map.br_blockcount;
+
+		lblkno += map.br_blockcount;
+	}
+
+	/*
+	 * Keep de-allocating extents until the remote-value region is gone.
+	 */
+	lblkno = args->rmtblkno;
+	blkcnt = args->rmtblkcnt;
+	done = 0;
+	while (!done) {
+		XFS_BMAP_INIT(args->flist, args->firstblock);
+		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
+				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+				    1, args->firstblock, args->flist, &done);
+		if (!error) {
+			error = xfs_bmap_finish(&args->trans, args->flist,
+						*args->firstblock, &committed);
+		}
+		if (error) {
+			ASSERT(committed);
+			args->trans = NULL;
+			xfs_bmap_cancel(args->flist);
+			return(error);
+		}
+
+		/*
+		 * bmap_finish() may have committed the last trans and started
+		 * a new one.  We need the inode to be in all transactions.
+		 */
+		if (committed) {
+			xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL);
+			xfs_trans_ihold(args->trans, args->dp);
+		}
+
+		/*
+		 * Close out trans and start the next one in the chain.
+		 */
+		if ((error = xfs_attr_rolltrans(&args->trans, args->dp)))
+			return (error);
+	}
+	return(0);
+}
+
+#if defined(XFS_ATTR_TRACE)
+/*
+ * Add a trace buffer entry for an attr_list context structure.
+ */
+void
+xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
+{
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where,
+		(__psunsigned_t)context->dp,
+		(__psunsigned_t)context->cursor->hashval,
+		(__psunsigned_t)context->cursor->blkno,
+		(__psunsigned_t)context->cursor->offset,
+		(__psunsigned_t)context->alist,
+		(__psunsigned_t)context->bufsize,
+		(__psunsigned_t)context->count,
+		(__psunsigned_t)context->firstu,
+		(__psunsigned_t)
+			(context->count > 0)
+				? (ATTR_ENTRY(context->alist,
+					      context->count-1)->a_valuelen)
+				: 0,
+		(__psunsigned_t)context->dupcnt,
+		(__psunsigned_t)context->flags,
+		(__psunsigned_t)NULL,
+		(__psunsigned_t)NULL,
+		(__psunsigned_t)NULL);
+}
+
+/*
+ * Add a trace buffer entry for a context structure and a Btree node.
+ */
+void
+xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
+			 struct xfs_da_intnode *node)
+{
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where,
+		(__psunsigned_t)context->dp,
+		(__psunsigned_t)context->cursor->hashval,
+		(__psunsigned_t)context->cursor->blkno,
+		(__psunsigned_t)context->cursor->offset,
+		(__psunsigned_t)context->alist,
+		(__psunsigned_t)context->bufsize,
+		(__psunsigned_t)context->count,
+		(__psunsigned_t)context->firstu,
+		(__psunsigned_t)
+			(context->count > 0)
+				? (ATTR_ENTRY(context->alist,
+					      context->count-1)->a_valuelen)
+				: 0,
+		(__psunsigned_t)context->dupcnt,
+		(__psunsigned_t)context->flags,
+		(__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Add a trace buffer entry for a context structure and a Btree element.
+ */
+void
+xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
+			  struct xfs_da_node_entry *btree)
+{
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where,
+		(__psunsigned_t)context->dp,
+		(__psunsigned_t)context->cursor->hashval,
+		(__psunsigned_t)context->cursor->blkno,
+		(__psunsigned_t)context->cursor->offset,
+		(__psunsigned_t)context->alist,
+		(__psunsigned_t)context->bufsize,
+		(__psunsigned_t)context->count,
+		(__psunsigned_t)context->firstu,
+		(__psunsigned_t)
+			(context->count > 0)
+				? (ATTR_ENTRY(context->alist,
+					      context->count-1)->a_valuelen)
+				: 0,
+		(__psunsigned_t)context->dupcnt,
+		(__psunsigned_t)context->flags,
+		(__psunsigned_t)INT_GET(btree->hashval, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(btree->before, ARCH_CONVERT),
+		(__psunsigned_t)NULL);
+}
+
+/*
+ * Add a trace buffer entry for a context structure and a leaf block.
+ */
+void
+xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_attr_leafblock *leaf)
+{
+	xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where,
+		(__psunsigned_t)context->dp,
+		(__psunsigned_t)context->cursor->hashval,
+		(__psunsigned_t)context->cursor->blkno,
+		(__psunsigned_t)context->cursor->offset,
+		(__psunsigned_t)context->alist,
+		(__psunsigned_t)context->bufsize,
+		(__psunsigned_t)context->count,
+		(__psunsigned_t)context->firstu,
+		(__psunsigned_t)
+			(context->count > 0)
+				? (ATTR_ENTRY(context->alist,
+					      context->count-1)->a_valuelen)
+				: 0,
+		(__psunsigned_t)context->dupcnt,
+		(__psunsigned_t)context->flags,
+		(__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
+		(__psunsigned_t)INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Add a trace buffer entry for the arguments given to the routine,
+ * generic form.
+ */
+void
+xfs_attr_trace_enter(int type, char *where,
+			 __psunsigned_t a2, __psunsigned_t a3,
+			 __psunsigned_t a4, __psunsigned_t a5,
+			 __psunsigned_t a6, __psunsigned_t a7,
+			 __psunsigned_t a8, __psunsigned_t a9,
+			 __psunsigned_t a10, __psunsigned_t a11,
+			 __psunsigned_t a12, __psunsigned_t a13,
+			 __psunsigned_t a14, __psunsigned_t a15)
+{
+	ASSERT(xfs_attr_trace_buf);
+	ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
+					 (void *)where,
+					 (void *)a2,  (void *)a3,  (void *)a4,
+					 (void *)a5,  (void *)a6,  (void *)a7,
+					 (void *)a8,  (void *)a9,  (void *)a10,
+					 (void *)a11, (void *)a12, (void *)a13,
+					 (void *)a14, (void *)a15);
+}
+#endif	/* XFS_ATTR_TRACE */
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
new file mode 100644
index 000000000000..0184e7b72cab
--- /dev/null
+++ b/fs/xfs/xfs_attr.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ATTR_H__
+#define __XFS_ATTR_H__
+
+/*
+ * xfs_attr.h
+ *
+ * Large attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.
+ * The internal links in the Btree are logical block offsets into the file.
+ *
+ * Small attribute lists use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
+
+#ifdef XFS_ALL_TRACE
+#define XFS_ATTR_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_ATTR_TRACE
+#endif
+
+
+/*========================================================================
+ * External interfaces
+ *========================================================================*/
+
+#define ATTR_ROOT	0x0002	/* use attrs in root namespace, not user */
+#define ATTR_CREATE	0x0010	/* pure create: fail if attr already exists */
+#define ATTR_REPLACE	0x0020	/* pure set: fail if attr does not exist */
+#define ATTR_KERNOTIME	0x1000	/* [kernel] don't update inode timestamps */
+#define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */
+#define ATTR_KERNAMELS	0x4000	/* [kernel] list attr names (simple list) */
+#define ATTR_KERNFULLS	0x8000	/* [kernel] full attr list, ie. root+user */
+
+/*
+ * The maximum size (into the kernel or returned from the kernel) of an
+ * attribute value or the buffer used for an attr_list() call.	Larger
+ * sizes will result in an E2BIG return code.
+ */
+#define ATTR_MAX_VALUELEN	(64*1024)	/* max length of a value */
+
+/*
+ * Define how lists of attribute names are returned to the user from
+ * the attr_list() call.  A large, 32bit aligned, buffer is passed in
+ * along with its size.	 We put an array of offsets at the top that each
+ * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
+ */
+typedef struct attrlist {
+	__s32	al_count;	/* number of entries in attrlist */
+	__s32	al_more;	/* T/F: more attrs (do call again) */
+	__s32	al_offset[1];	/* byte offsets of attrs [var-sized] */
+} attrlist_t;
+
+/*
+ * Show the interesting info about one attribute.  This is what the
+ * al_offset[i] entry points to.
+ */
+typedef struct attrlist_ent {	/* data from attr_list() */
+	__u32	a_valuelen;	/* number bytes in value of attr */
+	char	a_name[1];	/* attr name (NULL terminated) */
+} attrlist_ent_t;
+
+/*
+ * Given a pointer to the (char*) buffer containing the attr_list() result,
+ * and an index, return a pointer to the indicated attribute in the buffer.
+ */
+#define ATTR_ENTRY(buffer, index)		\
+	((attrlist_ent_t *)			\
+	 &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
+
+/*
+ * Multi-attribute operation vector.
+ */
+typedef struct attr_multiop {
+	int	am_opcode;	/* operation to perform (ATTR_OP_GET, etc.) */
+	int	am_error;	/* [out arg] result of this sub-op (an errno) */
+	char	*am_attrname;	/* attribute name to work with */
+	char	*am_attrvalue;	/* [in/out arg] attribute value (raw bytes) */
+	int	am_length;	/* [in/out arg] length of value */
+	int	am_flags;	/* bitwise OR of attr API flags defined above */
+} attr_multiop_t;
+
+#define ATTR_OP_GET	1	/* return the indicated attr's value */
+#define ATTR_OP_SET	2	/* set/create the indicated attr/value pair */
+#define ATTR_OP_REMOVE	3	/* remove the indicated attr */
+
+/*
+ * Kernel-internal version of the attrlist cursor.
+ */
+typedef struct attrlist_cursor_kern {
+	__u32	hashval;	/* hash value of next entry to add */
+	__u32	blkno;		/* block containing entry (suggestion) */
+	__u32	offset;		/* offset in list of equal-hashvals */
+	__u16	pad1;		/* padding to match user-level */
+	__u8	pad2;		/* padding to match user-level */
+	__u8	initted;	/* T/F: cursor has been initialized */
+} attrlist_cursor_kern_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+struct cred;
+struct vnode;
+struct xfs_inode;
+struct attrlist_cursor_kern;
+struct xfs_ext_attr;
+struct xfs_da_args;
+
+/*
+ * Overall external interface routines.
+ */
+int xfs_attr_get(bhv_desc_t *, char *, char *, int *, int, struct cred *);
+int xfs_attr_set(bhv_desc_t *, char *, char *, int, int, struct cred *);
+int xfs_attr_remove(bhv_desc_t *, char *, int, struct cred *);
+int xfs_attr_list(bhv_desc_t *, char *, int, int,
+			 struct attrlist_cursor_kern *, struct cred *);
+int xfs_attr_inactive(struct xfs_inode *dp);
+
+int xfs_attr_node_get(struct xfs_da_args *);
+int xfs_attr_leaf_get(struct xfs_da_args *);
+int xfs_attr_shortform_getvalue(struct xfs_da_args *);
+int xfs_attr_fetch(struct xfs_inode *, char *, char *, int);
+
+#endif	/* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_fetch.c b/fs/xfs/xfs_attr_fetch.c
new file mode 100644
index 000000000000..0c9af54eeed5
--- /dev/null
+++ b/fs/xfs/xfs_attr_fetch.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+int
+xfs_attr_fetch(xfs_inode_t *ip, char *name, char *value, int valuelen)
+{
+	xfs_da_args_t args;
+	int error;
+
+	if (XFS_IFORK_Q(ip) == 0)
+		return ENOATTR;
+	/*
+	 * Do the argument setup for the xfs_attr routines.
+	 */
+	bzero((char *)&args, sizeof(args));
+	args.dp = ip;
+	args.flags = ATTR_ROOT;
+	args.whichfork = XFS_ATTR_FORK;
+	args.name = name;
+	args.namelen = strlen(name);
+	args.value = value;
+	args.valuelen = valuelen;
+	args.hashval = xfs_da_hashname(args.name, args.namelen);
+	args.oknoent = 1;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (args.dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
+		error = xfs_attr_shortform_getvalue(&args);
+	else if (xfs_bmap_one_block(args.dp, XFS_ATTR_FORK))
+		error = xfs_attr_leaf_get(&args);
+	else
+		error = xfs_attr_node_get(&args);
+
+	if (error == EEXIST)
+		error = 0;
+
+	return(error);
+}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
new file mode 100644
index 000000000000..884da53fa54d
--- /dev/null
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -0,0 +1,2971 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+/*
+ * xfs_attr_leaf.c
+ *
+ * GROT: figure out how to recover gracefully when bmap returns ENOSPC.
+ */
+
+#include <xfs.h>
+
+/*
+ * xfs_attr_leaf.c
+ *
+ * Routines to implement leaf blocks of attributes as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
+					      int freemap_index);
+STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer);
+STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
+						   xfs_da_state_blk_t *blk1,
+						   xfs_da_state_blk_t *blk2);
+STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
+					   xfs_da_state_blk_t *leaf_blk_1,
+					   xfs_da_state_blk_t *leaf_blk_2,
+					   int *number_entries_in_blk1,
+					   int *number_usedbytes_in_blk1);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
+					 int src_start,
+					 xfs_attr_leafblock_t *dst_leaf,
+					 int dst_start, int move_count,
+					 xfs_mount_t *mp);
+
+
+/*========================================================================
+ * External routines when dirsize < XFS_LITINO(mp).
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a shortform attribute list.
+ */
+int
+xfs_attr_shortform_create(xfs_da_args_t *args)
+{
+	xfs_attr_sf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_ifork_t *ifp;
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	ifp = dp->i_afp;
+	ASSERT(ifp != NULL);
+	ASSERT(ifp->if_bytes == 0);
+	if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) {
+		ifp->if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL;
+		ifp->if_flags |= XFS_IFINLINE;
+	} else {
+		ASSERT(ifp->if_flags & XFS_IFINLINE);
+	}
+	xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
+	hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
+	INT_ZERO(hdr->count, ARCH_CONVERT);
+	INT_SET(hdr->totsize, ARCH_CONVERT, sizeof(*hdr));
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+	return(0);
+}
+
+/*
+ * Add a name/value pair to the shortform attribute list.
+ * Overflow from the inode has already been checked for.
+ */
+int
+xfs_attr_shortform_add(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i, offset, size;
+	xfs_inode_t *dp;
+	xfs_ifork_t *ifp;
+
+	dp = args->dp;
+	ifp = dp->i_afp;
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (bcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (((args->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+			continue;
+		return(XFS_ERROR(EEXIST));
+	}
+
+	offset = (char *)sfe - (char *)sf;
+	size = XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
+	xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = (xfs_attr_sf_entry_t *)((char *)sf + offset);
+
+	sfe->namelen = args->namelen;
+	INT_SET(sfe->valuelen, ARCH_CONVERT, args->valuelen);
+	sfe->flags = (args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0;
+	bcopy(args->name, sfe->nameval, args->namelen);
+	bcopy(args->value, &sfe->nameval[args->namelen], args->valuelen);
+	INT_MOD(sf->hdr.count, ARCH_CONVERT, 1);
+	INT_MOD(sf->hdr.totsize, ARCH_CONVERT, size);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+	return(0);
+}
+
+/*
+ * Remove a name from the shortform attribute list structure.
+ */
+int
+xfs_attr_shortform_remove(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int base, size=0, end, totsize, i;
+	xfs_inode_t *dp;
+
+	/*
+	 * Remove the attribute.
+	 */
+	dp = args->dp;
+	base = sizeof(xfs_attr_sf_hdr_t);
+	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe),
+					base += size, i++) {
+		size = XFS_ATTR_SF_ENTSIZE(sfe);
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (bcmp(sfe->nameval, args->name, args->namelen) != 0)
+			continue;
+		if (((args->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+			continue;
+		break;
+	}
+	if (i == INT_GET(sf->hdr.count, ARCH_CONVERT))
+		return(XFS_ERROR(ENOATTR));
+
+	end = base + size;
+	totsize = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
+	if (end != totsize) {
+		ovbcopy(&((char *)sf)[end], &((char *)sf)[base],
+							totsize - end);
+	}
+	INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
+	INT_MOD(sf->hdr.totsize, ARCH_CONVERT, -size);
+	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
+
+	return(0);
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_lookup(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i;
+	xfs_ifork_t *ifp;
+
+	ifp = args->dp->i_afp;
+	ASSERT(ifp->if_flags & XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (bcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (((args->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+			continue;
+		return(XFS_ERROR(EEXIST));
+	}
+	return(XFS_ERROR(ENOATTR));
+}
+
+/*
+ * Look up a name in a shortform attribute list structure.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_getvalue(xfs_da_args_t *args)
+{
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	int i;
+
+	ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE);
+	sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT);
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) {
+		if (sfe->namelen != args->namelen)
+			continue;
+		if (bcmp(args->name, sfe->nameval, args->namelen) != 0)
+			continue;
+		if (((args->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+			continue;
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+			return(XFS_ERROR(EEXIST));
+		}
+		if (args->valuelen < INT_GET(sfe->valuelen, ARCH_CONVERT)) {
+			args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+			return(XFS_ERROR(ERANGE));
+		}
+		args->valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+		bcopy(&sfe->nameval[args->namelen], args->value,
+						    args->valuelen);
+		return(XFS_ERROR(EEXIST));
+	}
+	return(XFS_ERROR(ENOATTR));
+}
+
+/*
+ * Convert from using the shortform to the leaf.
+ */
+int
+xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
+{
+	xfs_inode_t *dp;
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	xfs_da_args_t nargs;
+	char *tmpbuffer;
+	int error, i, size;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+	xfs_ifork_t *ifp;
+
+	dp = args->dp;
+	ifp = dp->i_afp;
+	sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data;
+	size = INT_GET(sf->hdr.totsize, ARCH_CONVERT);
+	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	bcopy(ifp->if_u1.if_data, tmpbuffer, size);
+	sf = (xfs_attr_shortform_t *)tmpbuffer;
+
+	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
+	bp = NULL;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error) {
+		/*
+		 * If we hit an IO error middle of the transaction inside
+		 * grow_inode(), we may have inconsistent data. Bail out.
+		 */
+		if (error == EIO)
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		bcopy(tmpbuffer, ifp->if_u1.if_data, size);	/* it back */
+		goto out;
+	}
+
+	ASSERT(blkno == 0);
+	error = xfs_attr_leaf_create(args, blkno, &bp);
+	if (error) {
+		error = xfs_da_shrink_inode(args, 0, bp);
+		bp = NULL;
+		if (error)
+			goto out;
+		xfs_idata_realloc(dp, size, XFS_ATTR_FORK);	/* try to put */
+		bcopy(tmpbuffer, ifp->if_u1.if_data, size);	/* it back */
+		goto out;
+	}
+
+	bzero((char *)&nargs, sizeof(nargs));
+	nargs.dp = dp;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.oknoent = 1;
+
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+		nargs.name = (char *)sfe->nameval;
+		nargs.namelen = sfe->namelen;
+		nargs.value = (char *)&sfe->nameval[nargs.namelen];
+		nargs.valuelen = INT_GET(sfe->valuelen, ARCH_CONVERT);
+		nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
+						sfe->namelen);
+		nargs.flags = (sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0;
+		error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
+		ASSERT(error == ENOATTR);
+		error = xfs_attr_leaf_add(bp, &nargs);
+		ASSERT(error != ENOSPC);
+		if (error)
+			goto out;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+	}
+	error = 0;
+
+out:
+	if(bp)
+		xfs_da_buf_done(bp);
+	kmem_free(tmpbuffer, size);
+	return(error);
+}
+
+STATIC int
+xfs_attr_shortform_compare(const void *a, const void *b)
+{
+	xfs_attr_sf_sort_t *sa, *sb;
+
+	sa = (xfs_attr_sf_sort_t *)a;
+	sb = (xfs_attr_sf_sort_t *)b;
+	if (INT_GET(sa->hash, ARCH_CONVERT)
+				< INT_GET(sb->hash, ARCH_CONVERT)) {
+		return(-1);
+	} else if (INT_GET(sa->hash, ARCH_CONVERT)
+				> INT_GET(sb->hash, ARCH_CONVERT)) {
+		return(1);
+	} else {
+		return(sa->entno - sb->entno);
+	}
+}
+
+/*
+ * Copy out entries of shortform attribute lists for attr_list().
+ * Shortform atrtribute lists are not stored in hashval sorted order.
+ * If the output buffer is not large enough to hold them all, then we
+ * we have to calculate each entries' hashvalue and sort them before
+ * we can begin returning them to the user.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_shortform_list(xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_sf_sort_t *sbuf, *sbp;
+	xfs_attr_shortform_t *sf;
+	xfs_attr_sf_entry_t *sfe;
+	xfs_inode_t *dp;
+	int sbsize, nsbuf, count, i;
+
+	ASSERT(context != NULL);
+	dp = context->dp;
+	ASSERT(dp != NULL);
+	ASSERT(dp->i_afp != NULL);
+	sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
+	ASSERT(sf != NULL);
+	if (INT_ISZERO(sf->hdr.count, ARCH_CONVERT))
+		return(0);
+	cursor = context->cursor;
+	ASSERT(cursor != NULL);
+
+	xfs_attr_trace_l_c("sf start", context);
+
+	/*
+	 * If the buffer is large enough, do not bother with sorting.
+	 * Note the generous fudge factor of 16 overhead bytes per entry.
+	 */
+	if ((dp->i_afp->if_bytes + INT_GET(sf->hdr.count, ARCH_CONVERT) * 16)
+							< context->bufsize) {
+		for (i = 0, sfe = &sf->list[0];
+				i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+			int ns = (sfe->flags & XFS_ATTR_ROOT)?
+						ROOT_NAMES : USER_NAMES;
+			if (((context->flags & ATTR_ROOT) != 0) !=
+			    ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
+			    !(context->flags & ATTR_KERNFULLS)) {
+				sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+				continue;
+			}
+			if (context->flags & ATTR_KERNOVAL) {
+				ASSERT(context->flags & ATTR_KERNAMELS);
+				context->count += xfs_namespaces[ns].namelen +
+					INT_GET(sfe->namelen, ARCH_CONVERT) + 1;
+			}
+			else {
+				if (xfs_attr_put_listent(context, ns,
+						   (char *)sfe->nameval,
+						   (int)sfe->namelen,
+						   (int)INT_GET(sfe->valuelen,
+								ARCH_CONVERT)))
+					break;
+			}
+			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+		}
+		xfs_attr_trace_l_c("sf big-gulp", context);
+		return(0);
+	}
+
+	/*
+	 * It didn't all fit, so we have to sort everything on hashval.
+	 */
+	sbsize = INT_GET(sf->hdr.count, ARCH_CONVERT) * sizeof(*sbuf);
+	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
+
+	/*
+	 * Scan the attribute list for the rest of the entries, storing
+	 * the relevant info from only those that match into a buffer.
+	 */
+	nsbuf = 0;
+	for (i = 0, sfe = &sf->list[0];
+			i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+		if (((char *)sfe < (char *)sf) ||
+		    ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)) ||
+		    (sfe->namelen >= MAXNAMELEN)) {
+			xfs_attr_trace_l_c("sf corrupted", context);
+			kmem_free(sbuf, sbsize);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		if (((context->flags & ATTR_ROOT) != 0) !=
+		    ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
+		    !(context->flags & ATTR_KERNFULLS)) {
+			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+			continue;
+		}
+		sbp->entno = i;
+		INT_SET(sbp->hash, ARCH_CONVERT,
+			xfs_da_hashname((char *)sfe->nameval, sfe->namelen));
+		sbp->name = (char *)sfe->nameval;
+		sbp->namelen = sfe->namelen;
+		/* These are bytes, and both on-disk, don't endian-flip */
+		sbp->valuelen = sfe->valuelen;
+		sbp->flags = sfe->flags;
+		sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
+		sbp++;
+		nsbuf++;
+	}
+
+	/*
+	 * Sort the entries on hash then entno.
+	 */
+	qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
+
+	/*
+	 * Re-find our place IN THE SORTED LIST.
+	 */
+	count = 0;
+	cursor->initted = 1;
+	cursor->blkno = 0;
+	for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
+		if (INT_GET(sbp->hash, ARCH_CONVERT) == cursor->hashval) {
+			if (cursor->offset == count) {
+				break;
+			}
+			count++;
+		} else if (INT_GET(sbp->hash, ARCH_CONVERT) > cursor->hashval) {
+			break;
+		}
+	}
+	if (i == nsbuf) {
+		kmem_free(sbuf, sbsize);
+		xfs_attr_trace_l_c("blk end", context);
+		return(0);
+	}
+
+	/*
+	 * Loop putting entries into the user buffer.
+	 */
+	for ( ; i < nsbuf; i++, sbp++) {
+		int ns = (sbp->flags & XFS_ATTR_ROOT)? ROOT_NAMES:USER_NAMES;
+		if (cursor->hashval != INT_GET(sbp->hash, ARCH_CONVERT)) {
+			cursor->hashval = INT_GET(sbp->hash, ARCH_CONVERT);
+			cursor->offset = 0;
+		}
+		if (context->flags & ATTR_KERNOVAL) {
+			ASSERT(context->flags & ATTR_KERNAMELS);
+			context->count += xfs_namespaces[ns].namelen
+					+ sbp->namelen + 1;
+		}
+		else {
+			if (xfs_attr_put_listent(context, ns,
+					sbp->name, sbp->namelen,
+					INT_GET(sbp->valuelen, ARCH_CONVERT)))
+				break;
+		}
+		cursor->offset++;
+	}
+
+	kmem_free(sbuf, sbsize);
+	xfs_attr_trace_l_c("sf E-O-F", context);
+	return(0);
+}
+
+/*
+ * Check a leaf attribute block to see if all the entries would fit into
+ * a shortform attribute list.
+ */
+int
+xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	int bytes, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+
+	entry = &leaf->entries[0];
+	bytes = sizeof(struct xfs_attr_sf_hdr);
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* don't copy partial entries */
+		if (!(entry->flags & XFS_ATTR_LOCAL))
+			return(0);
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return(0);
+		if (INT_GET(name_loc->valuelen, ARCH_CONVERT) >= XFS_ATTR_SF_ENTSIZE_MAX)
+			return(0);
+		bytes += sizeof(struct xfs_attr_sf_entry)-1
+				+ name_loc->namelen
+				+ INT_GET(name_loc->valuelen, ARCH_CONVERT);
+	}
+	return( bytes < XFS_IFORK_ASIZE(dp) );
+}
+
+/*
+ * Convert a leaf attribute list to shortform attribute list
+ */
+int
+xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_da_args_t nargs;
+	xfs_inode_t *dp;
+	char *tmpbuffer;
+	int error, i;
+
+	dp = args->dp;
+	tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+
+	ASSERT(bp != NULL);
+	bcopy(bp->data, tmpbuffer, XFS_LBSIZE(dp->i_mount));
+	leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	bzero(bp->data, XFS_LBSIZE(dp->i_mount));
+
+	/*
+	 * Clean out the prior contents of the attribute list.
+	 */
+	error = xfs_da_shrink_inode(args, 0, bp);
+	if (error)
+		goto out;
+	error = xfs_attr_shortform_create(args);
+	if (error)
+		goto out;
+
+	/*
+	 * Copy the attributes
+	 */
+	bzero((char *)&nargs, sizeof(nargs));
+	nargs.dp = dp;
+	nargs.firstblock = args->firstblock;
+	nargs.flist = args->flist;
+	nargs.total = args->total;
+	nargs.whichfork = XFS_ATTR_FORK;
+	nargs.trans = args->trans;
+	nargs.oknoent = 1;
+	entry = &leaf->entries[0];
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;	/* don't copy partial entries */
+		if (INT_ISZERO(entry->nameidx, ARCH_CONVERT))
+			continue;
+		ASSERT(entry->flags & XFS_ATTR_LOCAL);
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+		nargs.name = (char *)name_loc->nameval;
+		nargs.namelen = name_loc->namelen;
+		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
+		nargs.valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
+		nargs.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
+		nargs.flags = (entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0;
+		xfs_attr_shortform_add(&nargs);
+	}
+	error = 0;
+
+out:
+	kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
+	return(error);
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_attr_leaf_to_node(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_intnode_t *node;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp1, *bp2;
+	xfs_dablk_t blkno;
+	int error;
+
+	dp = args->dp;
+	bp1 = bp2 = NULL;
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		goto out;
+	error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
+					     XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	ASSERT(bp1 != NULL);
+	bp2 = NULL;
+	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
+					    XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	ASSERT(bp2 != NULL);
+	bcopy(bp1->data, bp2->data, XFS_LBSIZE(dp->i_mount));
+	xfs_da_buf_done(bp1);
+	bp1 = NULL;
+	xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	/*
+	 * Set up the new root node.
+	 */
+	error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK);
+	if (error)
+		goto out;
+	node = bp1->data;
+	leaf = bp2->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	/* both on-disk, don't endian-flip twice */
+	node->btree[0].hashval =
+		leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval;
+	INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
+	INT_SET(node->hdr.count, ARCH_CONVERT, 1);
+	xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1);
+	error = 0;
+out:
+	if (bp1)
+		xfs_da_buf_done(bp1);
+	if (bp2)
+		xfs_da_buf_done(bp2);
+	return(error);
+}
+
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf attribute list
+ * or a leaf in a node attribute list.
+ */
+int
+xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int error;
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp,
+					    XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	bzero((char *)leaf, XFS_LBSIZE(dp->i_mount));
+	hdr = &leaf->hdr;
+	INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_ATTR_LEAF_MAGIC);
+	INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
+	if (INT_ISZERO(hdr->firstused, ARCH_CONVERT)) {
+		INT_SET(hdr->firstused, ARCH_CONVERT,
+			XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN);
+	}
+
+	INT_SET(hdr->freemap[0].base, ARCH_CONVERT,
+						sizeof(xfs_attr_leaf_hdr_t));
+	INT_SET(hdr->freemap[0].size, ARCH_CONVERT,
+					  INT_GET(hdr->firstused, ARCH_CONVERT)
+					- INT_GET(hdr->freemap[0].base,
+								ARCH_CONVERT));
+
+	xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	*bpp = bp;
+	return(0);
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+				   xfs_da_state_blk_t *newblk)
+{
+	xfs_dablk_t blkno;
+	int error;
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
+	error = xfs_da_grow_inode(state->args, &blkno);
+	if (error)
+		return(error);
+	error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp);
+	if (error)
+		return(error);
+	newblk->blkno = blkno;
+	newblk->magic = XFS_ATTR_LEAF_MAGIC;
+
+	/*
+	 * Rebalance the entries across the two leaves.
+	 * NOTE: rebalance() currently depends on the 2nd block being empty.
+	 */
+	xfs_attr_leaf_rebalance(state, oldblk, newblk);
+	error = xfs_da_blk_link(state, oldblk, newblk);
+	if (error)
+		return(error);
+
+	/*
+	 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
+	 * modifies the index/blkno/rmtblk/rmtblkcnt fields to show the
+	 * "new" attrs info.  Will need the "old" info to remove it later.
+	 *
+	 * Insert the "new" entry in the correct block.
+	 */
+	if (state->inleaf)
+		error = xfs_attr_leaf_add(oldblk->bp, state->args);
+	else
+		error = xfs_attr_leaf_add(newblk->bp, state->args);
+
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
+	return(error);
+}
+
+/*
+ * Add a name to the leaf attribute list structure.
+ */
+int
+xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_attr_leaf_map_t *map;
+	int tablesize, entsize, sum, tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT((args->index >= 0)
+		&& (args->index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
+	hdr = &leaf->hdr;
+	entsize = xfs_attr_leaf_newentsize(args,
+			   args->trans->t_mountp->m_sb.sb_blocksize, NULL);
+
+	/*
+	 * Search through freemap for first-fit on new name length.
+	 * (may need to figure in size of entry struct too)
+	 */
+	tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1)
+					* sizeof(xfs_attr_leaf_entry_t)
+					+ sizeof(xfs_attr_leaf_hdr_t);
+	map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1];
+	for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
+		if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
+			sum += INT_GET(map->size, ARCH_CONVERT);
+			continue;
+		}
+		if (INT_ISZERO(map->size, ARCH_CONVERT))
+			continue;	/* no space in this map */
+		tmp = entsize;
+		if (INT_GET(map->base, ARCH_CONVERT)
+				< INT_GET(hdr->firstused, ARCH_CONVERT))
+			tmp += sizeof(xfs_attr_leaf_entry_t);
+		if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
+			tmp = xfs_attr_leaf_add_work(bp, args, i);
+			return(tmp);
+		}
+		sum += INT_GET(map->size, ARCH_CONVERT);
+	}
+
+	/*
+	 * If there are no holes in the address space of the block,
+	 * and we don't have enough freespace, then compaction will do us
+	 * no good and we should just give up.
+	 */
+	if (!hdr->holes && (sum < entsize))
+		return(XFS_ERROR(ENOSPC));
+
+	/*
+	 * Compact the entries to coalesce free space.
+	 * This may change the hdr->count via dropping INCOMPLETE entries.
+	 */
+	xfs_attr_leaf_compact(args->trans, bp);
+
+	/*
+	 * After compaction, the block is guaranteed to have only one
+	 * free region, in freemap[0].	If it is not big enough, give up.
+	 */
+	if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT)
+				< (entsize + sizeof(xfs_attr_leaf_entry_t)))
+		return(XFS_ERROR(ENOSPC));
+
+	return(xfs_attr_leaf_add_work(bp, args, 0));
+}
+
+/*
+ * Add a name to a leaf attribute list structure.
+ */
+STATIC int
+xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_attr_leaf_map_t *map;
+	xfs_mount_t *mp;
+	int tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE));
+	ASSERT((args->index >= 0)
+		&& (args->index <= INT_GET(hdr->count, ARCH_CONVERT)));
+
+	/*
+	 * Force open some space in the entry array and fill it in.
+	 */
+	entry = &leaf->entries[args->index];
+	if (args->index < INT_GET(hdr->count, ARCH_CONVERT)) {
+		tmp  = INT_GET(hdr->count, ARCH_CONVERT) - args->index;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		ovbcopy((char *)entry, (char *)(entry+1), tmp);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+	}
+	INT_MOD(hdr->count, ARCH_CONVERT, 1);
+
+	/*
+	 * Allocate space for the new string (at the end of the run).
+	 */
+	map = &hdr->freemap[mapindex];
+	mp = args->trans->t_mountp;
+	ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	ASSERT((INT_GET(map->base, ARCH_CONVERT) & 0x3) == 0);
+	ASSERT(INT_GET(map->size, ARCH_CONVERT)
+				>= xfs_attr_leaf_newentsize(args,
+					     mp->m_sb.sb_blocksize, NULL));
+	ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	ASSERT((INT_GET(map->size, ARCH_CONVERT) & 0x3) == 0);
+	INT_MOD(map->size, ARCH_CONVERT,
+		-xfs_attr_leaf_newentsize(args, mp->m_sb.sb_blocksize, &tmp));
+	INT_SET(entry->nameidx, ARCH_CONVERT,
+					INT_GET(map->base, ARCH_CONVERT)
+				      + INT_GET(map->size, ARCH_CONVERT));
+	INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
+	entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
+	entry->flags |= (args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0;
+	if (args->rename) {
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		if ((args->blkno2 == args->blkno) &&
+		    (args->index2 <= args->index)) {
+			args->index2++;
+		}
+	}
+	xfs_da_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	ASSERT((args->index == 0) || (INT_GET(entry->hashval, ARCH_CONVERT)
+						>= INT_GET((entry-1)->hashval,
+							    ARCH_CONVERT)));
+	ASSERT((args->index == INT_GET(hdr->count, ARCH_CONVERT)-1) ||
+	       (INT_GET(entry->hashval, ARCH_CONVERT)
+			    <= (INT_GET((entry+1)->hashval, ARCH_CONVERT))));
+
+	/*
+	 * Copy the attribute name and value into the new space.
+	 *
+	 * For "remote" attribute values, simply note that we need to
+	 * allocate space for the "remote" value.  We can't actually
+	 * allocate the extents in this transaction, and we can't decide
+	 * which blocks they should be as we might allocate more blocks
+	 * as part of this transaction (a split operation for example).
+	 */
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		name_loc->namelen = args->namelen;
+		INT_SET(name_loc->valuelen, ARCH_CONVERT, args->valuelen);
+		bcopy(args->name, (char *)name_loc->nameval, args->namelen);
+		bcopy(args->value, (char *)&name_loc->nameval[args->namelen],
+				   INT_GET(name_loc->valuelen, ARCH_CONVERT));
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt->namelen = args->namelen;
+		bcopy(args->name, (char *)name_rmt->name, args->namelen);
+		entry->flags |= XFS_ATTR_INCOMPLETE;
+		/* just in case */
+		INT_ZERO(name_rmt->valuelen, ARCH_CONVERT);
+		INT_ZERO(name_rmt->valueblk, ARCH_CONVERT);
+		args->rmtblkno = 1;
+		args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
+	}
+	xfs_da_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+				   xfs_attr_leaf_entsize(leaf, args->index)));
+
+	/*
+	 * Update the control info for this leaf node
+	 */
+	if (INT_GET(entry->nameidx, ARCH_CONVERT)
+				< INT_GET(hdr->firstused, ARCH_CONVERT)) {
+		/* both on-disk, don't endian-flip twice */
+		hdr->firstused = entry->nameidx;
+	}
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
+				>= ((INT_GET(hdr->count, ARCH_CONVERT)
+					* sizeof(*entry))+sizeof(*hdr)));
+	tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1)
+					* sizeof(xfs_attr_leaf_entry_t)
+					+ sizeof(xfs_attr_leaf_hdr_t);
+	map = &hdr->freemap[0];
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
+		if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
+			INT_MOD(map->base, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_entry_t));
+			INT_MOD(map->size, ARCH_CONVERT,
+					-sizeof(xfs_attr_leaf_entry_t));
+		}
+	}
+	INT_MOD(hdr->usedbytes, ARCH_CONVERT,
+				xfs_attr_leaf_entsize(leaf, args->index));
+	xfs_da_log_buf(args->trans, bp,
+		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+	return(0);
+}
+
+/*
+ * Garbage collect a leaf attribute list block by copying it to a new buffer.
+ */
+STATIC void
+xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
+{
+	xfs_attr_leafblock_t *leaf_s, *leaf_d;
+	xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_mount_t *mp;
+	char *tmpbuffer;
+
+	mp = trans->t_mountp;
+	tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	bcopy(bp->data, tmpbuffer, XFS_LBSIZE(mp));
+	bzero(bp->data, XFS_LBSIZE(mp));
+
+	/*
+	 * Copy basic information
+	 */
+	leaf_s = (xfs_attr_leafblock_t *)tmpbuffer;
+	leaf_d = bp->data;
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	hdr_d->info = hdr_s->info;	/* struct copy */
+	INT_SET(hdr_d->firstused, ARCH_CONVERT, XFS_LBSIZE(mp));
+	/* handle truncation gracefully */
+	if (INT_ISZERO(hdr_d->firstused, ARCH_CONVERT)) {
+		INT_SET(hdr_d->firstused, ARCH_CONVERT,
+				XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN);
+	}
+	INT_ZERO(hdr_d->usedbytes, ARCH_CONVERT);
+	INT_ZERO(hdr_d->count, ARCH_CONVERT);
+	hdr_d->holes = 0;
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_hdr_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
+				INT_GET(hdr_d->firstused, ARCH_CONVERT)
+			      - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+
+	/*
+	 * Copy all entry's in the same (sorted) order,
+	 * but allocate name/value pairs packed and in sequence.
+	 */
+	xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0,
+				(int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
+
+	xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
+
+	kmem_free(tmpbuffer, XFS_LBSIZE(mp));
+}
+
+/*
+ * Redistribute the attribute list entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of the
+ * old block.  At present, all (one) callers pass in an empty second block.
+ *
+ * This code adjusts the args->index/blkno and args->index2/blkno2 fields
+ * to match what it is doing in splitting the attribute leaf block.  Those
+ * values are used in "atomic rename" operations on attributes.	 Note that
+ * the "new" and "old" values can end up in different blocks.
+ */
+STATIC void
+xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+				       xfs_da_state_blk_t *blk2)
+{
+	xfs_da_args_t *args;
+	xfs_da_state_blk_t *tmp_blk;
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+	xfs_attr_leaf_hdr_t *hdr1, *hdr2;
+	int count, totallen, max, space, swap;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	args = state->args;
+
+	/*
+	 * Check ordering of blocks, reverse if it makes things simpler.
+	 *
+	 * NOTE: Given that all (current) callers pass in an empty
+	 * second block, this code should never set "swap".
+	 */
+	swap = 0;
+	if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) {
+		tmp_blk = blk1;
+		blk1 = blk2;
+		blk2 = tmp_blk;
+		leaf1 = blk1->bp->data;
+		leaf2 = blk2->bp->data;
+		swap = 1;
+	}
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.  Then get
+	 * the direction to copy and the number of elements to move.
+	 *
+	 * "inleaf" is true if the new entry should be inserted into blk1.
+	 * If "swap" is also true, then reverse the sense of "inleaf".
+	 */
+	state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2,
+							    &count, &totallen);
+	if (swap)
+		state->inleaf = !state->inleaf;
+
+	/*
+	 * Move any entries required from leaf to leaf:
+	 */
+	if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count = INT_GET(hdr1->count, ARCH_CONVERT) - count;
+		space  = INT_GET(hdr1->usedbytes, ARCH_CONVERT) - totallen;
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf2 is the destination, compact it if it looks tight.
+		 */
+		max  = INT_GET(hdr2->firstused, ARCH_CONVERT)
+						- sizeof(xfs_attr_leaf_hdr_t);
+		max -= INT_GET(hdr2->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t);
+		if (space > max) {
+			xfs_attr_leaf_compact(args->trans, blk2->bp);
+		}
+
+		/*
+		 * Move high entries from leaf1 to low end of leaf2.
+		 */
+		xfs_attr_leaf_moveents(leaf1,
+				INT_GET(hdr1->count, ARCH_CONVERT)-count,
+				leaf2, 0, count, state->mp);
+
+		xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+		xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+	} else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * I assert that since all callers pass in an empty
+		 * second buffer, this code should never execute.
+		 */
+
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		/* number entries being moved */
+		count -= INT_GET(hdr1->count, ARCH_CONVERT);
+		space  = totallen - INT_GET(hdr1->usedbytes, ARCH_CONVERT);
+		space += count * sizeof(xfs_attr_leaf_entry_t);
+
+		/*
+		 * leaf1 is the destination, compact it if it looks tight.
+		 */
+		max  = INT_GET(hdr1->firstused, ARCH_CONVERT)
+						- sizeof(xfs_attr_leaf_hdr_t);
+		max -= INT_GET(hdr1->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t);
+		if (space > max) {
+			xfs_attr_leaf_compact(args->trans, blk1->bp);
+		}
+
+		/*
+		 * Move low entries from leaf2 to high end of leaf1.
+		 */
+		xfs_attr_leaf_moveents(leaf2, 0, leaf1,
+				(int)INT_GET(hdr1->count, ARCH_CONVERT), count,
+				state->mp);
+
+		xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
+		xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+	}
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	blk1->hashval =
+	    INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
+				    ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
+	blk2->hashval =
+	    INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
+				    ARCH_CONVERT)-1].hashval, ARCH_CONVERT);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 * NOTE: this code depends on the (current) situation that the
+	 * second block was originally empty.
+	 *
+	 * If the insertion point moved to the 2nd block, we must adjust
+	 * the index.  We must also track the entry just following the
+	 * new entry for use in an "atomic rename" operation, that entry
+	 * is always the "old" entry and the "new" entry is what we are
+	 * inserting.  The index/blkno fields refer to the "old" entry,
+	 * while the index2/blkno2 fields refer to the "new" entry.
+	 */
+	if (blk1->index > INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
+		ASSERT(state->inleaf == 0);
+		blk2->index = blk1->index
+				- INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+		args->index = args->index2 = blk2->index;
+		args->blkno = args->blkno2 = blk2->blkno;
+	} else if (blk1->index == INT_GET(leaf1->hdr.count, ARCH_CONVERT)) {
+		if (state->inleaf) {
+			args->index = blk1->index;
+			args->blkno = blk1->blkno;
+			args->index2 = 0;
+			args->blkno2 = blk2->blkno;
+		} else {
+			blk2->index = blk1->index
+				    - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+			args->index = args->index2 = blk2->index;
+			args->blkno = args->blkno2 = blk2->blkno;
+		}
+	} else {
+		ASSERT(state->inleaf == 1);
+		args->index = args->index2 = blk1->index;
+		args->blkno = args->blkno2 = blk1->blkno;
+	}
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary?  With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_attr_leaf_figure_balance(xfs_da_state_t *state,
+				    xfs_da_state_blk_t *blk1,
+				    xfs_da_state_blk_t *blk2,
+				    int *countarg, int *usedbytesarg)
+{
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+	xfs_attr_leaf_hdr_t *hdr1, *hdr2;
+	xfs_attr_leaf_entry_t *entry;
+	int count, max, index, totallen, half;
+	int lastdelta, foundit, tmp;
+
+	/*
+	 * Set up environment.
+	 */
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+	foundit = 0;
+	totallen = 0;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.
+	 */
+	max = INT_GET(hdr1->count, ARCH_CONVERT)
+			+ INT_GET(hdr2->count, ARCH_CONVERT);
+	half  = (max+1) * sizeof(*entry);
+	half += INT_GET(hdr1->usedbytes, ARCH_CONVERT)
+				+ INT_GET(hdr2->usedbytes, ARCH_CONVERT)
+				+ xfs_attr_leaf_newentsize(state->args,
+						     state->blocksize, NULL);
+	half /= 2;
+	lastdelta = state->blocksize;
+	entry = &leaf1->entries[0];
+	for (count = index = 0; count < max; entry++, index++, count++) {
+
+#define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A))
+		/*
+		 * The new entry is in the first block, account for it.
+		 */
+		if (count == blk1->index) {
+			tmp = totallen + sizeof(*entry) +
+				xfs_attr_leaf_newentsize(state->args,
+							 state->blocksize,
+							 NULL);
+			if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+				break;
+			lastdelta = XFS_ATTR_ABS(half - tmp);
+			totallen = tmp;
+			foundit = 1;
+		}
+
+		/*
+		 * Wrap around into the second block if necessary.
+		 */
+		if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
+			leaf1 = leaf2;
+			entry = &leaf1->entries[0];
+			index = 0;
+		}
+
+		/*
+		 * Figure out if next leaf entry would be too much.
+		 */
+		tmp = totallen + sizeof(*entry) + xfs_attr_leaf_entsize(leaf1,
+									index);
+		if (XFS_ATTR_ABS(half - tmp) > lastdelta)
+			break;
+		lastdelta = XFS_ATTR_ABS(half - tmp);
+		totallen = tmp;
+#undef XFS_ATTR_ABS
+	}
+
+	/*
+	 * Calculate the number of usedbytes that will end up in lower block.
+	 * If new entry not in lower block, fix up the count.
+	 */
+	totallen -= count * sizeof(*entry);
+	if (foundit) {
+		totallen -= sizeof(*entry) +
+				xfs_attr_leaf_newentsize(state->args,
+							 state->blocksize,
+							 NULL);
+	}
+
+	*countarg = count;
+	*usedbytesarg = totallen;
+	return(foundit);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ *
+ * GROT: allow for INCOMPLETE entries in calculation.
+ */
+int
+xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	int count, bytes, forward, error, retval, i;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	info = blk->bp->data;
+	ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+	leaf = (xfs_attr_leafblock_t *)info;
+	count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	bytes = sizeof(xfs_attr_leaf_hdr_t) +
+		count * sizeof(xfs_attr_leaf_entry_t) +
+		INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+	if (bytes > (state->blocksize >> 1)) {
+		*action = 0;	/* blk over 50%, dont try to join */
+		return(0);
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (aribtrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (!INT_ISZERO(info->forw, ARCH_CONVERT));
+		bcopy(&state->path, &state->altpath, sizeof(state->path));
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return(error);
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return(0);
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink an attribute list over time.
+	 */
+	/* start with smaller blk num */
+	forward = (INT_GET(info->forw, ARCH_CONVERT)
+					< INT_GET(info->back, ARCH_CONVERT));
+	for (i = 0; i < 2; forward = !forward, i++) {
+		if (forward)
+			blkno = INT_GET(info->forw, ARCH_CONVERT);
+		else
+			blkno = INT_GET(info->back, ARCH_CONVERT);
+		if (blkno == 0)
+			continue;
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+					blkno, -1, &bp, XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+
+		leaf = (xfs_attr_leafblock_t *)info;
+		count  = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes  = state->blocksize - (state->blocksize>>2);
+		bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+		leaf = bp->data;
+		ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+		count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes -= INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+		bytes -= count * sizeof(xfs_attr_leaf_entry_t);
+		bytes -= sizeof(xfs_attr_leaf_hdr_t);
+		xfs_da_brelse(state->args->trans, bp);
+		if (bytes >= 0)
+			break;	/* fits with at least 25% to spare */
+	}
+	if (i >= 2) {
+		*action = 0;
+		return(0);
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	bcopy(&state->path, &state->altpath, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+	} else {
+		error = xfs_da_path_shift(state, &state->path, forward,
+						 0, &retval);
+	}
+	if (error)
+		return(error);
+	if (retval) {
+		*action = 0;
+	} else {
+		*action = 1;
+	}
+	return(0);
+}
+
+/*
+ * Remove a name from the leaf attribute list structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
+ */
+int
+xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_hdr_t *hdr;
+	xfs_attr_leaf_map_t *map;
+	xfs_attr_leaf_entry_t *entry;
+	int before, after, smallest, entsize;
+	int tablesize, tmp, i;
+	xfs_mount_t *mp;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	mp = args->trans->t_mountp;
+	ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0)
+		&& (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
+	ASSERT((args->index >= 0)
+		&& (args->index < INT_GET(hdr->count, ARCH_CONVERT)));
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT)
+				>= ((INT_GET(hdr->count, ARCH_CONVERT)
+					* sizeof(*entry))+sizeof(*hdr)));
+	entry = &leaf->entries[args->index];
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
+				>= INT_GET(hdr->firstused, ARCH_CONVERT));
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
+
+	/*
+	 * Scan through free region table:
+	 *    check for adjacency of free'd entry with an existing one,
+	 *    find smallest free region in case we need to replace it,
+	 *    adjust any map that borders the entry table,
+	 */
+	tablesize = INT_GET(hdr->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t)
+					+ sizeof(xfs_attr_leaf_hdr_t);
+	map = &hdr->freemap[0];
+	tmp = INT_GET(map->size, ARCH_CONVERT);
+	before = after = -1;
+	smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
+	entsize = xfs_attr_leaf_entsize(leaf, args->index);
+	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) {
+		ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
+		ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
+		if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
+			INT_MOD(map->base, ARCH_CONVERT,
+					-sizeof(xfs_attr_leaf_entry_t));
+			INT_MOD(map->size, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_entry_t));
+		}
+
+		if ((INT_GET(map->base, ARCH_CONVERT)
+					+ INT_GET(map->size, ARCH_CONVERT))
+				== INT_GET(entry->nameidx, ARCH_CONVERT)) {
+			before = i;
+		} else if (INT_GET(map->base, ARCH_CONVERT)
+			== (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
+			after = i;
+		} else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
+			tmp = INT_GET(map->size, ARCH_CONVERT);
+			smallest = i;
+		}
+	}
+
+	/*
+	 * Coalesce adjacent freemap regions,
+	 * or replace the smallest region.
+	 */
+	if ((before >= 0) || (after >= 0)) {
+		if ((before >= 0) && (after >= 0)) {
+			map = &hdr->freemap[before];
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+			INT_MOD(map->size, ARCH_CONVERT,
+				INT_GET(hdr->freemap[after].size,
+							ARCH_CONVERT));
+			INT_ZERO(hdr->freemap[after].base, ARCH_CONVERT);
+			INT_ZERO(hdr->freemap[after].size, ARCH_CONVERT);
+		} else if (before >= 0) {
+			map = &hdr->freemap[before];
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+		} else {
+			map = &hdr->freemap[after];
+			/* both on-disk, don't endian flip twice */
+			map->base = entry->nameidx;
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+		}
+	} else {
+		/*
+		 * Replace smallest region (if it is smaller than free'd entry)
+		 */
+		map = &hdr->freemap[smallest];
+		if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
+			INT_SET(map->base, ARCH_CONVERT,
+					INT_GET(entry->nameidx, ARCH_CONVERT));
+			INT_SET(map->size, ARCH_CONVERT, entsize);
+		}
+	}
+
+	/*
+	 * Did we remove the first entry?
+	 */
+	if (INT_GET(entry->nameidx, ARCH_CONVERT)
+				== INT_GET(hdr->firstused, ARCH_CONVERT))
+		smallest = 1;
+	else
+		smallest = 0;
+
+	/*
+	 * Compress the remaining entries and zero out the removed stuff.
+	 */
+	bzero(XFS_ATTR_LEAF_NAME(leaf, args->index), entsize);
+	INT_MOD(hdr->usedbytes, ARCH_CONVERT, -entsize);
+	xfs_da_log_buf(args->trans, bp,
+	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+				   entsize));
+
+	tmp = (INT_GET(hdr->count, ARCH_CONVERT) - args->index)
+					* sizeof(xfs_attr_leaf_entry_t);
+	ovbcopy((char *)(entry+1), (char *)entry, tmp);
+	INT_MOD(hdr->count, ARCH_CONVERT, -1);
+	xfs_da_log_buf(args->trans, bp,
+	    XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry)));
+	entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
+	bzero((char *)entry, sizeof(xfs_attr_leaf_entry_t));
+
+	/*
+	 * If we removed the first entry, re-find the first used byte
+	 * in the name area.  Note that if the entry was the "firstused",
+	 * then we don't have a "hole" in our block resulting from
+	 * removing the name.
+	 */
+	if (smallest) {
+		tmp = XFS_LBSIZE(mp);
+		entry = &leaf->entries[0];
+		for (i = INT_GET(hdr->count, ARCH_CONVERT)-1;
+						i >= 0; entry++, i--) {
+			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
+				>= INT_GET(hdr->firstused, ARCH_CONVERT));
+			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT)
+							< XFS_LBSIZE(mp));
+			if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
+				tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
+		}
+		INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
+		if (INT_ISZERO(hdr->firstused, ARCH_CONVERT)) {
+			INT_SET(hdr->firstused, ARCH_CONVERT,
+					tmp - XFS_ATTR_LEAF_NAME_ALIGN);
+		}
+	} else {
+		hdr->holes = 1;		/* mark as needing compaction */
+	}
+	xfs_da_log_buf(args->trans, bp,
+			  XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+
+	/*
+	 * Check if leaf is less than 50% full, caller may want to
+	 * "join" the leaf with a sibling if so.
+	 */
+	tmp  = sizeof(xfs_attr_leaf_hdr_t);
+	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t);
+	tmp += INT_GET(leaf->hdr.usedbytes, ARCH_CONVERT);
+	return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */
+}
+
+/*
+ * Move all the attribute list entries from drop_leaf into save_leaf.
+ */
+void
+xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+				       xfs_da_state_blk_t *save_blk)
+{
+	xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
+	xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
+	xfs_mount_t *mp;
+	char *tmpbuffer;
+
+	/*
+	 * Set up environment.
+	 */
+	mp = state->mp;
+	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	drop_leaf = drop_blk->bp->data;
+	save_leaf = save_blk->bp->data;
+	ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	drop_hdr = &drop_leaf->hdr;
+	save_hdr = &save_leaf->hdr;
+
+	/*
+	 * Save last hashval from dying block for later Btree fixup.
+	 */
+	drop_blk->hashval =
+		INT_GET(drop_leaf->entries[INT_GET(drop_leaf->hdr.count,
+						ARCH_CONVERT)-1].hashval,
+								ARCH_CONVERT);
+
+	/*
+	 * Check if we need a temp buffer, or can we do it in place.
+	 * Note that we don't check "leaf" for holes because we will
+	 * always be dropping it, toosmall() decided that for us already.
+	 */
+	if (save_hdr->holes == 0) {
+		/*
+		 * dest leaf has no holes, so we add there.  May need
+		 * to make some room in the entry array.
+		 */
+		if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
+			xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0,
+			     (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+		} else {
+			xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf,
+				  INT_GET(save_hdr->count, ARCH_CONVERT),
+				  (int)INT_GET(drop_hdr->count, ARCH_CONVERT),
+				  mp);
+		}
+	} else {
+		/*
+		 * Destination has holes, so we make a temporary copy
+		 * of the leaf and add them both to that.
+		 */
+		tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
+		ASSERT(tmpbuffer != NULL);
+		bzero(tmpbuffer, state->blocksize);
+		tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer;
+		tmp_hdr = &tmp_leaf->hdr;
+		tmp_hdr->info = save_hdr->info; /* struct copy */
+		INT_ZERO(tmp_hdr->count, ARCH_CONVERT);
+		INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
+		if (INT_ISZERO(tmp_hdr->firstused, ARCH_CONVERT)) {
+			INT_SET(tmp_hdr->firstused, ARCH_CONVERT,
+				state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN);
+		}
+		INT_ZERO(tmp_hdr->usedbytes, ARCH_CONVERT);
+		if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) {
+			xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
+				(int)INT_GET(drop_hdr->count, ARCH_CONVERT),
+				mp);
+			xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf,
+				  INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
+				 (int)INT_GET(save_hdr->count, ARCH_CONVERT),
+				 mp);
+		} else {
+			xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
+				(int)INT_GET(save_hdr->count, ARCH_CONVERT),
+				mp);
+			xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf,
+				INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
+				(int)INT_GET(drop_hdr->count, ARCH_CONVERT),
+				mp);
+		}
+		bcopy((char *)tmp_leaf, (char *)save_leaf, state->blocksize);
+		kmem_free(tmpbuffer, state->blocksize);
+	}
+
+	xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
+					   state->blocksize - 1);
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	save_blk->hashval =
+		INT_GET(save_leaf->entries[INT_GET(save_leaf->hdr.count,
+						ARCH_CONVERT)-1].hashval,
+								ARCH_CONVERT);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf attribute list structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node.  The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in args->index the index into the entry[] array of either
+ * the found entry, or where the entry should have been (insert before
+ * that entry).
+ *
+ * Don't change the args->value unless we find the attribute.
+ */
+int
+xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int probe, span;
+	xfs_dahash_t hashval;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
+					< (XFS_LBSIZE(args->dp->i_mount)/8));
+
+	/*
+	 * Binary search.  (note: small blocks will skip this loop)
+	 */
+	hashval = args->hashval;
+	probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
+	for (entry = &leaf->entries[probe]; span > 4;
+		   entry = &leaf->entries[probe]) {
+		span /= 2;
+		if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
+			probe += span;
+		else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
+			probe -= span;
+		else
+			break;
+	}
+	ASSERT((probe >= 0) && \
+	       ((INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+	       || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
+	ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT)
+							== hashval));
+
+	/*
+	 * Since we may have duplicate hashval's, find the first matching
+	 * hashval in the leaf.
+	 */
+	while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT)
+							>= hashval)) {
+		entry--;
+		probe--;
+	}
+	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+		&& (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
+		entry++;
+		probe++;
+	}
+	if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT))
+		    || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
+		args->index = probe;
+		return(XFS_ERROR(ENOATTR));
+	}
+
+	/*
+	 * Duplicate keys may be present, so search all of them for a match.
+	 */
+	for (  ; (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+			&& (INT_GET(entry->hashval, ARCH_CONVERT) == hashval);
+			entry++, probe++) {
+/*
+ * GROT: Add code to remove incomplete entries.
+ */
+		/*
+		 * If we are looking for INCOMPLETE entries, show only those.
+		 * If we are looking for complete entries, show only those.
+		 */
+		if ((args->flags & XFS_ATTR_INCOMPLETE) !=
+		    (entry->flags & XFS_ATTR_INCOMPLETE)) {
+			continue;
+		}
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
+			if (name_loc->namelen != args->namelen)
+				continue;
+			if (bcmp(args->name, (char *)name_loc->nameval,
+					     args->namelen) != 0)
+				continue;
+			if (((args->flags & ATTR_ROOT) != 0) !=
+			    ((entry->flags & XFS_ATTR_ROOT) != 0))
+				continue;
+			args->index = probe;
+			return(XFS_ERROR(EEXIST));
+		} else {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe);
+			if (name_rmt->namelen != args->namelen)
+				continue;
+			if (bcmp(args->name, (char *)name_rmt->name,
+					     args->namelen) != 0)
+				continue;
+			if (((args->flags & ATTR_ROOT) != 0) !=
+			    ((entry->flags & XFS_ATTR_ROOT) != 0))
+				continue;
+			args->index = probe;
+			args->rmtblkno
+				  = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
+			args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount,
+						   INT_GET(name_rmt->valuelen,
+								ARCH_CONVERT));
+			return(XFS_ERROR(EEXIST));
+		}
+	}
+	args->index = probe;
+	return(XFS_ERROR(ENOATTR));
+}
+
+/*
+ * Get the value associated with an attribute name from a leaf attribute
+ * list structure.
+ */
+int
+xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
+{
+	int valuelen;
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT)
+					< (XFS_LBSIZE(args->dp->i_mount)/8));
+	ASSERT(args->index < ((int)INT_GET(leaf->hdr.count, ARCH_CONVERT)));
+
+	entry = &leaf->entries[args->index];
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		ASSERT(name_loc->namelen == args->namelen);
+		ASSERT(bcmp(args->name, name_loc->nameval, args->namelen) == 0);
+		valuelen = INT_GET(name_loc->valuelen, ARCH_CONVERT);
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = valuelen;
+			return(0);
+		}
+		if (args->valuelen < valuelen) {
+			args->valuelen = valuelen;
+			return(XFS_ERROR(ERANGE));
+		}
+		args->valuelen = valuelen;
+		bcopy(&name_loc->nameval[args->namelen], args->value, valuelen);
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		ASSERT(name_rmt->namelen == args->namelen);
+		ASSERT(bcmp(args->name, name_rmt->name, args->namelen) == 0);
+		valuelen = INT_GET(name_rmt->valuelen, ARCH_CONVERT);
+		args->rmtblkno = INT_GET(name_rmt->valueblk, ARCH_CONVERT);
+		args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen);
+		if (args->flags & ATTR_KERNOVAL) {
+			args->valuelen = valuelen;
+			return(0);
+		}
+		if (args->valuelen < valuelen) {
+			args->valuelen = valuelen;
+			return(XFS_ERROR(ERANGE));
+		}
+		args->valuelen = valuelen;
+	}
+	return(0);
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
+			xfs_attr_leafblock_t *leaf_d, int start_d,
+			int count, xfs_mount_t *mp)
+{
+	xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_attr_leaf_entry_t *entry_s, *entry_d;
+	int desti, tmp, i;
+
+	/*
+	 * Check for nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0)
+				&& (INT_GET(hdr_s->count, ARCH_CONVERT)
+						< (XFS_LBSIZE(mp)/8)));
+	ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_s->count, ARCH_CONVERT)
+					* sizeof(*entry_s))+sizeof(*hdr_s)));
+	ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
+	ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_d->count, ARCH_CONVERT)
+					* sizeof(*entry_d))+sizeof(*hdr_d)));
+
+	ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
+	ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
+	ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
+
+	/*
+	 * Move the entries in the destination leaf up to make a hole?
+	 */
+	if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
+		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_d->entries[start_d];
+		entry_d = &leaf_d->entries[start_d + count];
+		ovbcopy((char *)entry_s, (char *)entry_d, tmp);
+	}
+
+	/*
+	 * Copy all entry's in the same (sorted) order,
+	 * but allocate attribute info packed and in sequence.
+	 */
+	entry_s = &leaf_s->entries[start_s];
+	entry_d = &leaf_d->entries[start_d];
+	desti = start_d;
+	for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) {
+		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT)
+				>= INT_GET(hdr_s->firstused, ARCH_CONVERT));
+		tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i);
+#ifdef GROT
+		/*
+		 * Code to drop INCOMPLETE entries.  Difficult to use as we
+		 * may also need to change the insertion index.	 Code turned
+		 * off for 6.2, should be revisited later.
+		 */
+		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
+			bzero(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
+			INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
+			INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
+			entry_d--;	/* to compensate for ++ in loop hdr */
+			desti--;
+			if ((start_s + i) < offset)
+				result++;	/* insertion index adjustment */
+		} else {
+#endif /* GROT */
+			INT_MOD(hdr_d->firstused, ARCH_CONVERT, -tmp);
+			/* both on-disk, don't endian flip twice */
+			entry_d->hashval = entry_s->hashval;
+			/* both on-disk, don't endian flip twice */
+			entry_d->nameidx = hdr_d->firstused;
+			entry_d->flags = entry_s->flags;
+			ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp
+							<= XFS_LBSIZE(mp));
+			ovbcopy(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i),
+			      XFS_ATTR_LEAF_NAME(leaf_d, desti), tmp);
+			ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp
+							<= XFS_LBSIZE(mp));
+			bzero(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
+			INT_MOD(hdr_s->usedbytes, ARCH_CONVERT, -tmp);
+			INT_MOD(hdr_d->usedbytes, ARCH_CONVERT, tmp);
+			INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
+			INT_MOD(hdr_d->count, ARCH_CONVERT, 1);
+			tmp = INT_GET(hdr_d->count, ARCH_CONVERT)
+						* sizeof(xfs_attr_leaf_entry_t)
+						+ sizeof(xfs_attr_leaf_hdr_t);
+			ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
+#ifdef GROT
+		}
+#endif /* GROT */
+	}
+
+	/*
+	 * Zero out the entries we just copied.
+	 */
+	if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + XFS_LBSIZE(mp)));
+		bzero((char *)entry_s, tmp);
+	} else {
+		/*
+		 * Move the remaining entries down to fill the hole,
+		 * then zero the entries at the top.
+		 */
+		tmp  = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
+		tmp *= sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s + count];
+		entry_d = &leaf_s->entries[start_s];
+		ovbcopy((char *)entry_s, (char *)entry_d, tmp);
+
+		tmp = count * sizeof(xfs_attr_leaf_entry_t);
+		entry_s = &leaf_s->entries[INT_GET(hdr_s->count,
+							ARCH_CONVERT)];
+		ASSERT(((char *)entry_s + tmp) <=
+		       ((char *)leaf_s + XFS_LBSIZE(mp)));
+		bzero((char *)entry_s, tmp);
+	}
+
+	/*
+	 * Fill in the freemap information
+	 */
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT,
+					sizeof(xfs_attr_leaf_hdr_t));
+	INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT,
+				INT_GET(hdr_d->count, ARCH_CONVERT)
+					* sizeof(xfs_attr_leaf_entry_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT,
+				INT_GET(hdr_d->firstused, ARCH_CONVERT)
+			      - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+	INT_ZERO(hdr_d->freemap[1].base, ARCH_CONVERT);
+	INT_ZERO(hdr_d->freemap[2].base, ARCH_CONVERT);
+	INT_ZERO(hdr_d->freemap[1].size, ARCH_CONVERT);
+	INT_ZERO(hdr_d->freemap[2].size, ARCH_CONVERT);
+	hdr_s->holes = 1;	/* leaf may not be compact */
+}
+
+/*
+ * Compare two leaf blocks "order".
+ * Return 0 unless leaf2 should go before leaf1.
+ */
+int
+xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
+{
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+
+	leaf1 = leaf1_bp->data;
+	leaf2 = leaf2_bp->data;
+	ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC) &&
+	       (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC));
+	if (   (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0)
+	    && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0)
+	    && (   (INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
+		      INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT))
+		|| (INT_GET(leaf2->entries[INT_GET(leaf2->hdr.count,
+				ARCH_CONVERT)-1].hashval, ARCH_CONVERT) <
+		      INT_GET(leaf1->entries[INT_GET(leaf1->hdr.count,
+				ARCH_CONVERT)-1].hashval, ARCH_CONVERT))) ) {
+		return(1);
+	}
+	return(0);
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ */
+xfs_dahash_t
+xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count)
+{
+	xfs_attr_leafblock_t *leaf;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	if (count)
+		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	if (INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+		return(0);
+	return(INT_GET(leaf->entries[INT_GET(leaf->hdr.count,
+				ARCH_CONVERT)-1].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Calculate the number of bytes used to store the indicated attribute
+ * (whether local or remote only calculate bytes in this block).
+ */
+int
+xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
+{
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int size;
+
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index);
+		size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen,
+						   INT_GET(name_loc->valuelen,
+								ARCH_CONVERT));
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index);
+		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen);
+	}
+	return(size);
+}
+
+/*
+ * Calculate the number of bytes that would be required to store the new
+ * attribute (whether local or remote only calculate bytes in this block).
+ * This routine decides as a side effect whether the attribute will be
+ * a "local" or a "remote" attribute.
+ */
+int
+xfs_attr_leaf_newentsize(xfs_da_args_t *args, int blocksize, int *local)
+{
+	int size;
+
+	size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(args->namelen, args->valuelen);
+	if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) {
+		if (local) {
+			*local = 1;
+		}
+	} else {
+		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(args->namelen);
+		if (local) {
+			*local = 0;
+		}
+	}
+	return(size);
+}
+
+/*
+ * Copy out attribute list entries for attr_list(), for leaf attribute lists.
+ */
+int
+xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
+{
+	attrlist_cursor_kern_t *cursor;
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_local_t *name_loc;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	int retval, i;
+
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	cursor = context->cursor;
+	cursor->initted = 1;
+
+	xfs_attr_trace_l_cl("blk start", context, leaf);
+
+	/*
+	 * Re-find our place in the leaf block if this is a new syscall.
+	 */
+	if (context->resynch) {
+		entry = &leaf->entries[0];
+		for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
+							entry++, i++) {
+			if (INT_GET(entry->hashval, ARCH_CONVERT)
+							== cursor->hashval) {
+				if (cursor->offset == context->dupcnt) {
+					context->dupcnt = 0;
+					break;
+				}
+				context->dupcnt++;
+			} else if (INT_GET(entry->hashval, ARCH_CONVERT)
+							> cursor->hashval) {
+				context->dupcnt = 0;
+				break;
+			}
+		}
+		if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
+			xfs_attr_trace_l_c("not found", context);
+			return(0);
+		}
+	} else {
+		entry = &leaf->entries[0];
+		i = 0;
+	}
+	context->resynch = 0;
+
+	/*
+	 * We have found our place, start copying out the new attributes.
+	 */
+	retval = 0;
+	for (  ; (i < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+	     && (retval == 0); entry++, i++) {
+		int ns = (entry->flags & XFS_ATTR_ROOT)? ROOT_NAMES:USER_NAMES;
+
+		if (INT_GET(entry->hashval, ARCH_CONVERT) != cursor->hashval) {
+			cursor->hashval = INT_GET(entry->hashval, ARCH_CONVERT);
+			cursor->offset = 0;
+		}
+
+		if (entry->flags & XFS_ATTR_INCOMPLETE)
+			continue;		/* skip incomplete entries */
+		if (((context->flags & ATTR_ROOT) != 0) !=
+		    ((entry->flags & XFS_ATTR_ROOT) != 0) &&
+		    !(context->flags & ATTR_KERNFULLS))
+			continue;		/* skip non-matching entries */
+
+		if (entry->flags & XFS_ATTR_LOCAL) {
+			name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+			if (context->flags & ATTR_KERNOVAL) {
+				ASSERT(context->flags & ATTR_KERNAMELS);
+				context->count += xfs_namespaces[ns].namelen
+						+ (int)name_loc->namelen + 1;
+			} else {
+				retval = xfs_attr_put_listent(context, ns,
+					(char *)name_loc->nameval,
+					(int)name_loc->namelen,
+					(int)INT_GET(name_loc->valuelen,
+								ARCH_CONVERT));
+			}
+		} else {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			if (context->flags & ATTR_KERNOVAL) {
+				ASSERT(context->flags & ATTR_KERNAMELS);
+				context->count += xfs_namespaces[ns].namelen
+						+ (int)name_rmt->namelen + 1;
+			} else {
+				retval = xfs_attr_put_listent(context, ns,
+					(char *)name_rmt->name,
+					(int)name_rmt->namelen,
+					(int)INT_GET(name_rmt->valuelen,
+								ARCH_CONVERT));
+			}
+		}
+		if (retval == 0) {
+			cursor->offset++;
+		}
+	}
+	xfs_attr_trace_l_cl("blk end", context, leaf);
+	return(retval);
+}
+
+#define ATTR_ENTBASESIZE		/* minimum bytes used by an attr */ \
+	(((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define ATTR_ENTSIZE(namelen)		/* actual bytes used by an attr */ \
+	((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+	 & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+/*ARGSUSED*/
+int
+xfs_attr_put_listent(xfs_attr_list_context_t *context,
+		     int ns, char *name, int namelen, int valuelen)
+{
+	attrlist_ent_t *aep;
+	int arraytop;
+
+	ASSERT(!(context->flags & ATTR_KERNOVAL));
+	if (context->flags & ATTR_KERNAMELS) {
+		char *offset;
+		xattr_namespace_t *nsp;
+
+		ASSERT(context->count >= 0);
+
+		nsp = &xfs_namespaces[ns];
+		arraytop = context->count + nsp->namelen + namelen+1;
+		if (arraytop > context->firstu) {
+			context->count = -1;	/* insufficient space */
+			return(1);
+		}
+		offset = (char *)context->alist + context->count;
+		strncpy(offset, nsp->name, nsp->namelen);	/* namespace */
+		offset += nsp->namelen;
+		strncpy(offset, name, namelen);			/* real name */
+		offset += namelen;
+		*offset = '\0';
+		context->count += nsp->namelen + namelen + 1;
+		return(0);
+	}
+
+	ASSERT(context->count >= 0);
+	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+	ASSERT(context->firstu >= sizeof(*context->alist));
+	ASSERT(context->firstu <= context->bufsize);
+
+	arraytop = sizeof(*context->alist) +
+			context->count * sizeof(context->alist->al_offset[0]);
+	context->firstu -= ATTR_ENTSIZE(namelen);
+	if (context->firstu < arraytop) {
+		xfs_attr_trace_l_c("buffer full", context);
+		context->alist->al_more = 1;
+		return(1);
+	}
+
+	aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
+	aep->a_valuelen = valuelen;
+	bcopy(name, aep->a_name, namelen);
+	aep->a_name[ namelen ] = 0;
+	context->alist->al_offset[ context->count++ ] = context->firstu;
+	context->alist->al_count = context->count;
+	xfs_attr_trace_l_c("add", context);
+	return(0);
+}
+
+/*========================================================================
+ * Manage the INCOMPLETE flag in a leaf entry
+ *========================================================================*/
+
+/*
+ * Clear the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr_leaf_clearflag(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_dabuf_t *bp;
+	int error;
+#ifdef DEBUG
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen;
+	char *name;
+#endif /* DEBUG */
+
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	ASSERT(bp != NULL);
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index >= 0);
+	entry = &leaf->entries[ args->index ];
+	ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
+
+#ifdef DEBUG
+	if (entry->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		namelen = name_loc->namelen;
+		name = (char *)name_loc->nameval;
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		namelen = name_rmt->namelen;
+		name = (char *)name_rmt->name;
+	}
+	ASSERT(INT_GET(entry->hashval, ARCH_CONVERT) == args->hashval);
+	ASSERT(namelen == args->namelen);
+	ASSERT(bcmp(name, args->name, namelen) == 0);
+#endif /* DEBUG */
+
+	entry->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+	if (args->rmtblkno) {
+		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
+		INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
+		xfs_da_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+	xfs_da_buf_done(bp);
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_attr_rolltrans(&args->trans, args->dp);
+
+	return(error);
+}
+
+/*
+ * Set the INCOMPLETE flag on an entry in a leaf block.
+ */
+int
+xfs_attr_leaf_setflag(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_dabuf_t *bp;
+	int error;
+
+	/*
+	 * Set up the operation.
+	 */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	ASSERT(bp != NULL);
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index < INT_GET(leaf->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index >= 0);
+	entry = &leaf->entries[ args->index ];
+
+	ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0);
+	entry->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp,
+			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		INT_ZERO(name_rmt->valueblk, ARCH_CONVERT);
+		INT_ZERO(name_rmt->valuelen, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp,
+			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
+	}
+	xfs_da_buf_done(bp);
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_attr_rolltrans(&args->trans, args->dp);
+
+	return(error);
+}
+
+/*
+ * In a single transaction, clear the INCOMPLETE flag on the leaf entry
+ * given by args->blkno/index and set the INCOMPLETE flag on the leaf
+ * entry given by args->blkno2/index2.
+ *
+ * Note that they could be in different blocks, or in the same block.
+ */
+int
+xfs_attr_leaf_flipflags(xfs_da_args_t *args)
+{
+	xfs_attr_leafblock_t *leaf1, *leaf2;
+	xfs_attr_leaf_entry_t *entry1, *entry2;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_dabuf_t *bp1, *bp2;
+	int error;
+#ifdef DEBUG
+	xfs_attr_leaf_name_local_t *name_loc;
+	int namelen1, namelen2;
+	char *name1, *name2;
+#endif /* DEBUG */
+
+	/*
+	 * Read the block containing the "old" attr
+	 */
+	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
+					     XFS_ATTR_FORK);
+	if (error) {
+		return(error);
+	}
+	ASSERT(bp1 != NULL);
+
+	/*
+	 * Read the block containing the "new" attr, if it is different
+	 */
+	if (args->blkno2 != args->blkno) {
+		error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
+					-1, &bp2, XFS_ATTR_FORK);
+		if (error) {
+			return(error);
+		}
+		ASSERT(bp2 != NULL);
+	} else {
+		bp2 = bp1;
+	}
+
+	leaf1 = bp1->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index < INT_GET(leaf1->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index >= 0);
+	entry1 = &leaf1->entries[ args->index ];
+
+	leaf2 = bp2->data;
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+	ASSERT(args->index2 < INT_GET(leaf2->hdr.count, ARCH_CONVERT));
+	ASSERT(args->index2 >= 0);
+	entry2 = &leaf2->entries[ args->index2 ];
+
+#ifdef DEBUG
+	if (entry1->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index);
+		namelen1 = name_loc->namelen;
+		name1 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		namelen1 = name_rmt->namelen;
+		name1 = (char *)name_rmt->name;
+	}
+	if (entry2->flags & XFS_ATTR_LOCAL) {
+		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2);
+		namelen2 = name_loc->namelen;
+		name2 = (char *)name_loc->nameval;
+	} else {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		namelen2 = name_rmt->namelen;
+		name2 = (char *)name_rmt->name;
+	}
+	ASSERT(INT_GET(entry1->hashval, ARCH_CONVERT) == INT_GET(entry2->hashval, ARCH_CONVERT));
+	ASSERT(namelen1 == namelen2);
+	ASSERT(bcmp(name1, name2, namelen1) == 0);
+#endif /* DEBUG */
+
+	ASSERT(entry1->flags & XFS_ATTR_INCOMPLETE);
+	ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0);
+
+	entry1->flags &= ~XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp1,
+			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
+	if (args->rmtblkno) {
+		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		INT_SET(name_rmt->valueblk, ARCH_CONVERT, args->rmtblkno);
+		INT_SET(name_rmt->valuelen, ARCH_CONVERT, args->valuelen);
+		xfs_da_log_buf(args->trans, bp1,
+			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
+	}
+
+	entry2->flags |= XFS_ATTR_INCOMPLETE;
+	xfs_da_log_buf(args->trans, bp2,
+			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
+	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
+		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		INT_ZERO(name_rmt->valueblk, ARCH_CONVERT);
+		INT_ZERO(name_rmt->valuelen, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp2,
+			 XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt)));
+	}
+	xfs_da_buf_done(bp1);
+	if (bp1 != bp2)
+		xfs_da_buf_done(bp2);
+
+	/*
+	 * Commit the flag value change and start the next trans in series.
+	 */
+	error = xfs_attr_rolltrans(&args->trans, args->dp);
+
+	return(error);
+}
+
+/*========================================================================
+ * Indiscriminately delete the entire attribute fork
+ *========================================================================*/
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
+{
+	xfs_da_blkinfo_t *info;
+	xfs_daddr_t blkno;
+	xfs_dabuf_t *bp;
+	int error;
+
+	/*
+	 * Read block 0 to see what we have to work with.
+	 * We only get here if we have extents, since we remove
+	 * the extents in reverse order the extent containing
+	 * block 0 must still be there.
+	 */
+	error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	blkno = xfs_da_blkno(bp);
+
+	/*
+	 * Invalidate the tree, even if the "tree" is only a single leaf block.
+	 * This is a depth-first traversal!
+	 */
+	info = bp->data;
+	if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+		error = xfs_attr_node_inactive(trans, dp, bp, 1);
+	} else if (INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
+		error = xfs_attr_leaf_inactive(trans, dp, bp);
+	} else {
+		error = XFS_ERROR(EIO);
+		xfs_da_brelse(*trans, bp);
+	}
+	if (error)
+		return(error);
+
+	/*
+	 * Invalidate the incore copy of the root block.
+	 */
+	error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
+	if (error)
+		return(error);
+	xfs_da_binval(*trans, bp);	/* remove from cache */
+	/*
+	 * Commit the invalidate and start the next transaction.
+	 */
+	error = xfs_attr_rolltrans(trans, dp);
+
+	return (error);
+}
+
+/*
+ * Recurse (gasp!) through the attribute nodes until we find leaves.
+ * We're doing a depth-first traversal in order to invalidate everything.
+ */
+int
+xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
+				   int level)
+{
+	xfs_da_blkinfo_t *info;
+	xfs_da_intnode_t *node;
+	xfs_dablk_t child_fsb;
+	xfs_daddr_t parent_blkno, child_blkno;
+	int error, count, i;
+	xfs_dabuf_t *child_bp;
+
+	/*
+	 * Since this code is recursive (gasp!) we must protect ourselves.
+	 */
+	if (level > XFS_DA_NODE_MAXDEPTH) {
+		xfs_da_brelse(*trans, bp);	/* no locks for later trans */
+		return(XFS_ERROR(EIO));
+	}
+
+	node = bp->data;
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT)
+						== XFS_DA_NODE_MAGIC);
+	parent_blkno = xfs_da_blkno(bp);	/* save for re-read later */
+	count = INT_GET(node->hdr.count, ARCH_CONVERT);
+	if (!count) {
+		xfs_da_brelse(*trans, bp);
+		return(0);
+	}
+	child_fsb = INT_GET(node->btree[0].before, ARCH_CONVERT);
+	xfs_da_brelse(*trans, bp);	/* no locks for later trans */
+
+	/*
+	 * If this is the node level just above the leaves, simply loop
+	 * over the leaves removing all of them.  If this is higher up
+	 * in the tree, recurse downward.
+	 */
+	for (i = 0; i < count; i++) {
+		/*
+		 * Read the subsidiary block to see what we have to work with.
+		 * Don't do this in a transaction.  This is a depth-first
+		 * traversal of the tree so we may deal with many blocks
+		 * before we come back to this one.
+		 */
+		error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
+						XFS_ATTR_FORK);
+		if (error)
+			return(error);
+		if (child_bp) {
+						/* save for re-read later */
+			child_blkno = xfs_da_blkno(child_bp);
+
+			/*
+			 * Invalidate the subtree, however we have to.
+			 */
+			info = child_bp->data;
+			if (INT_GET(info->magic, ARCH_CONVERT)
+							== XFS_DA_NODE_MAGIC) {
+				error = xfs_attr_node_inactive(trans, dp,
+						child_bp, level+1);
+			} else if (INT_GET(info->magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC) {
+				error = xfs_attr_leaf_inactive(trans, dp,
+						child_bp);
+			} else {
+				error = XFS_ERROR(EIO);
+				xfs_da_brelse(*trans, child_bp);
+			}
+			if (error)
+				return(error);
+
+			/*
+			 * Remove the subsidiary block from the cache
+			 * and from the log.
+			 */
+			error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
+				&child_bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+			xfs_da_binval(*trans, child_bp);
+		}
+
+		/*
+		 * If we're not done, re-read the parent to get the next
+		 * child block number.
+		 */
+		if ((i+1) < count) {
+			error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
+				&bp, XFS_ATTR_FORK);
+			if (error)
+				return(error);
+			child_fsb = INT_GET(node->btree[i+1].before, ARCH_CONVERT);
+			xfs_da_brelse(*trans, bp);
+		}
+		/*
+		 * Atomically commit the whole invalidate stuff.
+		 */
+		if ((error = xfs_attr_rolltrans(trans, dp)))
+			return (error);
+	}
+
+	return(0);
+}
+
+/*
+ * Invalidate all of the "remote" value regions pointed to by a particular
+ * leaf block.
+ * Note that we must release the lock on the buffer so that we are not
+ * caught holding something that the logging code wants to flush to disk.
+ */
+int
+xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
+{
+	xfs_attr_leafblock_t *leaf;
+	xfs_attr_leaf_entry_t *entry;
+	xfs_attr_leaf_name_remote_t *name_rmt;
+	xfs_attr_inactive_list_t *list, *lp;
+	int error, count, size, tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT)
+						== XFS_ATTR_LEAF_MAGIC);
+
+	/*
+	 * Count the number of "remote" value extents.
+	 */
+	count = 0;
+	entry = &leaf->entries[0];
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (   INT_GET(entry->nameidx, ARCH_CONVERT)
+		    && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			if (!INT_ISZERO(name_rmt->valueblk, ARCH_CONVERT))
+				count++;
+		}
+	}
+
+	/*
+	 * If there are no "remote" values, we're done.
+	 */
+	if (count == 0) {
+		xfs_da_brelse(*trans, bp);
+		return(0);
+	}
+
+	/*
+	 * Allocate storage for a list of all the "remote" value extents.
+	 */
+	size = count * sizeof(xfs_attr_inactive_list_t);
+	list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP);
+
+	/*
+	 * Identify each of the "remote" value extents.
+	 */
+	lp = list;
+	entry = &leaf->entries[0];
+	for (i = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); entry++, i++) {
+		if (   INT_GET(entry->nameidx, ARCH_CONVERT)
+		    && ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
+			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			if (!INT_ISZERO(name_rmt->valueblk, ARCH_CONVERT)) {
+				/* both on-disk, don't endian flip twice */
+				lp->valueblk = name_rmt->valueblk;
+				INT_SET(lp->valuelen, ARCH_CONVERT,
+						XFS_B_TO_FSB(dp->i_mount,
+						    INT_GET(name_rmt->valuelen,
+							      ARCH_CONVERT)));
+				lp++;
+			}
+		}
+	}
+	xfs_da_brelse(*trans, bp);	/* unlock for trans. in freextent() */
+
+	/*
+	 * Invalidate each of the "remote" value extents.
+	 */
+	error = 0;
+	for (lp = list, i = 0; i < count; i++, lp++) {
+		tmp = xfs_attr_leaf_freextent(trans, dp,
+						     INT_GET(lp->valueblk,
+								ARCH_CONVERT),
+						     INT_GET(lp->valuelen,
+								ARCH_CONVERT));
+		if (error == 0)
+			error = tmp;	/* save only the 1st errno */
+	}
+
+	kmem_free((xfs_caddr_t)list, size);
+	return(error);
+}
+
+/*
+ * Look at all the extents for this logical region,
+ * invalidate any buffers that are incore/in transactions.
+ */
+int
+xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
+				    xfs_dablk_t blkno, int blkcnt)
+{
+	xfs_bmbt_irec_t map;
+	xfs_dablk_t tblkno;
+	int tblkcnt, dblkcnt, nmap, error;
+	xfs_daddr_t dblkno;
+	xfs_buf_t *bp;
+
+	/*
+	 * Roll through the "value", invalidating the attribute value's
+	 * blocks.
+	 */
+	tblkno = blkno;
+	tblkcnt = blkcnt;
+	while (tblkcnt > 0) {
+		/*
+		 * Try to remember where we decided to put the value.
+		 */
+		nmap = 1;
+		error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt,
+					XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
+					NULL, 0, &map, &nmap, NULL);
+		if (error) {
+			return(error);
+		}
+		ASSERT(nmap == 1);
+		ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+
+		/*
+		 * If it's a hole, these are already unmapped
+		 * so there's nothing to invalidate.
+		 */
+		if (map.br_startblock != HOLESTARTBLOCK) {
+
+			dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
+						  map.br_startblock);
+			dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
+						map.br_blockcount);
+			bp = xfs_trans_get_buf(*trans,
+					dp->i_mount->m_ddev_targp,
+					dblkno, dblkcnt, 0);
+			xfs_trans_binval(*trans, bp);
+			/*
+			 * Roll to next transaction.
+			 */
+			if ((error = xfs_attr_rolltrans(trans, dp)))
+				return (error);
+		}
+
+		tblkno += map.br_blockcount;
+		tblkcnt -= map.br_blockcount;
+	}
+
+	return(0);
+}
+
+
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to the next.
+ */
+int
+xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
+{
+	xfs_trans_t *trans;
+	unsigned int logres, count;
+	int	error;
+
+	/*
+	 * Ensure that the inode is always logged.
+	 */
+	trans = *transp;
+	xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+
+	/*
+	 * Copy the critical parameters from one trans to the next.
+	 */
+	logres = trans->t_log_res;
+	count = trans->t_log_count;
+	*transp = xfs_trans_dup(trans);
+
+	/*
+	 * Commit the current transaction.
+	 * If this commit failed, then it'd just unlock those items that
+	 * are not marked ihold. That also means that a filesystem shutdown
+	 * is in progress. The caller takes the responsibility to cancel
+	 * the duplicate transaction that gets returned.
+	 */
+	if ((error = xfs_trans_commit(trans, 0, NULL)))
+		return (error);
+
+	trans = *transp;
+
+	/*
+	 * Reserve space in the log for th next transaction.
+	 * This also pushes items in the "AIL", the list of logged items,
+	 * out to disk if they are taking up space at the tail of the log
+	 * that we want to use.	 This requires that either nothing be locked
+	 * across this call, or that anything that is locked be logged in
+	 * the prior and the next transactions.
+	 */
+	error = xfs_trans_reserve(trans, 0, logres, 0,
+				  XFS_TRANS_PERM_LOG_RES, count);
+	/*
+	 *  Ensure that the inode is in the new transaction and locked.
+	 */
+	if (!error) {
+		xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(trans, dp);
+	}
+	return (error);
+
+}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
new file mode 100644
index 000000000000..8d301be59c45
--- /dev/null
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ATTR_LEAF_H__
+#define __XFS_ATTR_LEAF_H__
+
+/*
+ * Attribute storage layout, internal structure, access macros, etc.
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.	The
+ * internal links in the Btree are logical block offsets into the file.
+ */
+
+struct attrlist;
+struct attrlist_cursor_kern;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*========================================================================
+ * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This is the structure of the leaf nodes in the Btree.
+ *
+ * Struct leaf_entry's are packed from the top.	 Name/values grow from the
+ * bottom but are not packed.  The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped.  When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again.  If we
+ * still don't have enough space, then we have to split the block.  The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string.  The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard.  If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT versus USER attribute in
+ * the leaf_entry.  The namespaces are independent only because we also look
+ * at the root/user bit when we are looking for a matching attribute name.
+ *
+ * We also store a "incomplete" bit in the leaf_entry.	It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set.  We clear the
+ * bit when we have finished setting up the attribute.	We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE	3	/* how many freespace slots */
+
+typedef struct xfs_attr_leafblock {
+	struct xfs_attr_leaf_hdr {	/* constant-structure header block */
+		xfs_da_blkinfo_t info;	/* block type, links, etc. */
+		__uint16_t count;	/* count of active leaf_entry's */
+		__uint16_t usedbytes;	/* num bytes of names/values stored */
+		__uint16_t firstused;	/* first used byte in name area */
+		__uint8_t  holes;	/* != 0 if blk needs compaction */
+		__uint8_t  pad1;
+		struct xfs_attr_leaf_map {	  /* RLE map of free bytes */
+			__uint16_t base;	  /* base of free region */
+			__uint16_t size;	  /* length of free region */
+		} freemap[XFS_ATTR_LEAF_MAPSIZE]; /* N largest free regions */
+	} hdr;
+	struct xfs_attr_leaf_entry {	/* sorted on key, not name */
+		xfs_dahash_t hashval;	/* hash value of name */
+		__uint16_t nameidx;	/* index into buffer of name/value */
+		__uint8_t flags;	/* LOCAL, ROOT and INCOMPLETE flags */
+		__uint8_t pad2;		/* unused pad byte */
+	} entries[1];			/* variable sized array */
+	struct xfs_attr_leaf_name_local {
+		__uint16_t valuelen;	/* number of bytes in value */
+		__uint8_t namelen;	/* length of name bytes */
+		__uint8_t nameval[1];	/* name/value bytes */
+	} namelist;			/* grows from bottom of buf */
+	struct xfs_attr_leaf_name_remote {
+		xfs_dablk_t valueblk;	/* block number of value bytes */
+		__uint32_t valuelen;	/* number of bytes in value */
+		__uint8_t namelen;	/* length of name bytes */
+		__uint8_t name[1];	/* name bytes */
+	} valuelist;			/* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+typedef struct xfs_attr_leaf_hdr xfs_attr_leaf_hdr_t;
+typedef struct xfs_attr_leaf_map xfs_attr_leaf_map_t;
+typedef struct xfs_attr_leaf_entry xfs_attr_leaf_entry_t;
+typedef struct xfs_attr_leaf_name_local xfs_attr_leaf_name_local_t;
+typedef struct xfs_attr_leaf_name_remote xfs_attr_leaf_name_remote_t;
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call, they are "or"ed together for various operations.
+ */
+#define XFS_ATTR_LOCAL_BIT	0	/* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT	1	/* limit access to attr to userid 0 */
+#define XFS_ATTR_INCOMPLETE_BIT 7	/* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL		(1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT		(1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_INCOMPLETE	(1 << XFS_ATTR_INCOMPLETE_BIT)
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN	((uint)sizeof(xfs_dablk_t))
+
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE)
+xfs_attr_leaf_name_remote_t *
+xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx);
+#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx)	\
+	xfs_attr_leaf_name_remote(leafp,idx)
+#else
+#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx)	/* remote name struct ptr */ \
+	((xfs_attr_leaf_name_remote_t *)		\
+	 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL)
+xfs_attr_leaf_name_local_t *
+xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx);
+#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx)	\
+	xfs_attr_leaf_name_local(leafp,idx)
+#else
+#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx)	/* local name struct ptr */ \
+	((xfs_attr_leaf_name_local_t *)		\
+	 &((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_NAME)
+char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx);
+#define XFS_ATTR_LEAF_NAME(leafp,idx)		xfs_attr_leaf_name(leafp,idx)
+#else
+#define XFS_ATTR_LEAF_NAME(leafp,idx)		/* generic name struct ptr */ \
+	(&((char *)(leafp))[ INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT) ])
+#endif
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE)
+int xfs_attr_leaf_entsize_remote(int nlen);
+#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen)	\
+	xfs_attr_leaf_entsize_remote(nlen)
+#else
+#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen)	/* space for remote struct */ \
+	(((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
+	  XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL)
+int xfs_attr_leaf_entsize_local(int nlen, int vlen);
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen)	\
+	xfs_attr_leaf_entsize_local(nlen,vlen)
+#else
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen)	/* space for local struct */ \
+	(((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + \
+	  XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX)
+int xfs_attr_leaf_entsize_local_max(int bsize);
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize)	\
+	xfs_attr_leaf_entsize_local_max(bsize)
+#else
+#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize)	/* max local struct size */ \
+	(((bsize) >> 1) + ((bsize) >> 2))
+#endif
+
+
+/*========================================================================
+ * Structure used to pass context around among the routines.
+ *========================================================================*/
+
+typedef struct xfs_attr_list_context {
+	struct xfs_inode		*dp;	/* inode */
+	struct attrlist_cursor_kern	*cursor;/* position in list */
+	struct attrlist			*alist; /* output buffer */
+	int				count;	/* num used entries */
+	int				dupcnt; /* count dup hashvals seen */
+	int				bufsize;/* total buffer size */
+	int				firstu; /* first used byte in buffer */
+	int				flags;	/* from VOP call */
+	int				resynch;/* T/F: resynch with cursor */
+} xfs_attr_list_context_t;
+
+/*
+ * Used to keep a list of "remote value" extents when unlinking an inode.
+ */
+typedef struct xfs_attr_inactive_list {
+	xfs_dablk_t	valueblk;	/* block number of value bytes */
+	int		valuelen;	/* number of bytes in value */
+} xfs_attr_inactive_list_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when dirsize < XFS_LITINO(mp).
+ */
+int	xfs_attr_shortform_create(struct xfs_da_args *args);
+int	xfs_attr_shortform_add(struct xfs_da_args *add);
+int	xfs_attr_shortform_lookup(struct xfs_da_args *args);
+int	xfs_attr_shortform_getvalue(struct xfs_da_args *args);
+int	xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
+int	xfs_attr_shortform_remove(struct xfs_da_args *remove);
+int	xfs_attr_shortform_list(struct xfs_attr_list_context *context);
+int	xfs_attr_shortform_replace(struct xfs_da_args *args);
+int	xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp);
+
+/*
+ * Internal routines when dirsize == XFS_LBSIZE(mp).
+ */
+int	xfs_attr_leaf_to_node(struct xfs_da_args *args);
+int	xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp,
+					  struct xfs_da_args *args);
+int	xfs_attr_leaf_clearflag(struct xfs_da_args *args);
+int	xfs_attr_leaf_setflag(struct xfs_da_args *args);
+int	xfs_attr_leaf_flipflags(xfs_da_args_t *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_attr_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block,
+				    struct xfs_dabuf **bpp);
+int	xfs_attr_leaf_split(struct xfs_da_state *state,
+				   struct xfs_da_state_blk *oldblk,
+				   struct xfs_da_state_blk *newblk);
+int	xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf,
+					struct xfs_da_args *args);
+int	xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args);
+int	xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer,
+				 struct xfs_da_args *args);
+int	xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer,
+				    struct xfs_da_args *args);
+int	xfs_attr_leaf_list_int(struct xfs_dabuf *bp,
+				      struct xfs_attr_list_context *context);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void	xfs_attr_leaf_unbalance(struct xfs_da_state *state,
+				       struct xfs_da_state_blk *drop_blk,
+				       struct xfs_da_state_blk *save_blk);
+int	xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
+int	xfs_attr_node_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
+				      struct xfs_dabuf *bp, int level);
+int	xfs_attr_leaf_inactive(struct xfs_trans **trans, struct xfs_inode *dp,
+				      struct xfs_dabuf *bp);
+int	xfs_attr_leaf_freextent(struct xfs_trans **trans, struct xfs_inode *dp,
+				       xfs_dablk_t blkno, int blkcnt);
+
+/*
+ * Utility routines.
+ */
+xfs_dahash_t	xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count);
+int	xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
+				   struct xfs_dabuf *leaf2_bp);
+int	xfs_attr_leaf_newentsize(struct xfs_da_args *args, int blocksize,
+					int *local);
+int	xfs_attr_leaf_entsize(struct xfs_attr_leafblock *leaf, int index);
+int	xfs_attr_put_listent(struct xfs_attr_list_context *context,
+			     int ns, char *name, int namelen, int valuelen);
+int	xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
+
+#endif	/* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
new file mode 100644
index 000000000000..6071fe0e571b
--- /dev/null
+++ b/fs/xfs/xfs_attr_sf.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ATTR_SF_H__
+#define __XFS_ATTR_SF_H__
+
+/*
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as
+ * to fit into the literal area of the inode.
+ */
+
+struct xfs_inode;
+
+/*
+ * Entries are packed toward the top as tight as possible.
+ */
+typedef struct xfs_attr_shortform {
+	struct xfs_attr_sf_hdr {	/* constant-structure header block */
+		__uint16_t totsize;	/* total bytes in shortform list */
+		__uint8_t count;	/* count of active entries */
+	} hdr;
+	struct xfs_attr_sf_entry {
+		__uint8_t namelen;	/* actual length of name (no NULL) */
+		__uint8_t valuelen;	/* actual length of value (no NULL) */
+		__uint8_t flags;	/* flags bits (see xfs_attr_leaf.h) */
+		__uint8_t nameval[1];	/* name & value bytes concatenated */
+	} list[1];			/* variable sized array */
+} xfs_attr_shortform_t;
+typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
+typedef struct xfs_attr_sf_entry xfs_attr_sf_entry_t;
+
+/*
+ * We generate this then sort it, attr_list() must return things in hash-order.
+ */
+typedef struct xfs_attr_sf_sort {
+	__uint8_t	entno;		/* entry number in original list */
+	__uint8_t	namelen;	/* length of name value (no null) */
+	__uint8_t	valuelen;	/* length of value */
+	__uint8_t	flags;		/* flags bits (see xfs_attr_leaf.h) */
+	xfs_dahash_t	hash;		/* this entry's hash value */
+	char		*name;		/* name value, pointer into buffer */
+} xfs_attr_sf_sort_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME)
+int xfs_attr_sf_entsize_byname(int nlen, int vlen);
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	\
+	xfs_attr_sf_entsize_byname(nlen,vlen)
+#else
+#define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen)	/* space name/value uses */ \
+	((int)sizeof(xfs_attr_sf_entry_t)-1 + (nlen)+(vlen))
+#endif
+#define XFS_ATTR_SF_ENTSIZE_MAX			/* max space for name&value */ \
+	((1 << (NBBY*(int)sizeof(__uint8_t))) - 1)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_ENTSIZE)
+int xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep);
+#define XFS_ATTR_SF_ENTSIZE(sfep)	xfs_attr_sf_entsize(sfep)
+#else
+#define XFS_ATTR_SF_ENTSIZE(sfep)		/* space an entry uses */ \
+	((int)sizeof(xfs_attr_sf_entry_t)-1 + (sfep)->namelen+(sfep)->valuelen)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_NEXTENTRY)
+xfs_attr_sf_entry_t *xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep);
+#define XFS_ATTR_SF_NEXTENTRY(sfep)	xfs_attr_sf_nextentry(sfep)
+#else
+#define XFS_ATTR_SF_NEXTENTRY(sfep)		/* next entry in struct */ \
+	((xfs_attr_sf_entry_t *) \
+		((char *)(sfep) + XFS_ATTR_SF_ENTSIZE(sfep)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ATTR_SF_TOTSIZE)
+int xfs_attr_sf_totsize(struct xfs_inode *dp);
+#define XFS_ATTR_SF_TOTSIZE(dp)		xfs_attr_sf_totsize(dp)
+#else
+#define XFS_ATTR_SF_TOTSIZE(dp)			/* total space in use */ \
+	(INT_GET(((xfs_attr_shortform_t *)((dp)->i_afp->if_u1.if_data))->hdr.totsize, ARCH_CONVERT))
+#endif
+
+#ifdef XFS_ALL_TRACE
+#define XFS_ATTR_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef	XFS_ATTR_TRACE
+#endif
+
+/*
+ * Kernel tracing support for attribute lists
+ */
+struct xfs_attr_list_context;
+struct xfs_da_intnode;
+struct xfs_da_node_entry;
+struct xfs_attr_leafblock;
+
+#define XFS_ATTR_TRACE_SIZE	4096	/* size of global trace buffer */
+
+/*
+ * Trace record types.
+ */
+#define XFS_ATTR_KTRACE_L_C	1	/* context */
+#define XFS_ATTR_KTRACE_L_CN	2	/* context, node */
+#define XFS_ATTR_KTRACE_L_CB	3	/* context, btree */
+#define XFS_ATTR_KTRACE_L_CL	4	/* context, leaf */
+
+#if defined(XFS_ATTR_TRACE)
+
+void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context);
+void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_da_intnode *node);
+void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_da_node_entry *btree);
+void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
+			      struct xfs_attr_leafblock *leaf);
+void xfs_attr_trace_enter(int type, char *where,
+			     __psunsigned_t a2, __psunsigned_t a3,
+			     __psunsigned_t a4, __psunsigned_t a5,
+			     __psunsigned_t a6, __psunsigned_t a7,
+			     __psunsigned_t a8, __psunsigned_t a9,
+			     __psunsigned_t a10, __psunsigned_t a11,
+			     __psunsigned_t a12, __psunsigned_t a13,
+			     __psunsigned_t a14, __psunsigned_t a15);
+#else
+#define xfs_attr_trace_l_c(w,c)
+#define xfs_attr_trace_l_cn(w,c,n)
+#define xfs_attr_trace_l_cb(w,c,b)
+#define xfs_attr_trace_l_cl(w,c,l)
+#endif /* XFS_ATTR_TRACE */
+
+#endif	/* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
new file mode 100644
index 000000000000..e5deb3dc40e6
--- /dev/null
+++ b/fs/xfs/xfs_bit.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * XFS bit manipulation routines, used in non-realtime code.
+ */
+
+#include <xfs.h>
+
+#ifndef HAVE_ARCH_HIGHBIT
+/*
+ * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
+ */
+const char xfs_highbit[256] = {
+       -1, 0, 1, 1, 2, 2, 2, 2,			/* 00 .. 07 */
+	3, 3, 3, 3, 3, 3, 3, 3,			/* 08 .. 0f */
+	4, 4, 4, 4, 4, 4, 4, 4,			/* 10 .. 17 */
+	4, 4, 4, 4, 4, 4, 4, 4,			/* 18 .. 1f */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 20 .. 27 */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 28 .. 2f */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 30 .. 37 */
+	5, 5, 5, 5, 5, 5, 5, 5,			/* 38 .. 3f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 40 .. 47 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 48 .. 4f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 50 .. 57 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 58 .. 5f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 60 .. 67 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 68 .. 6f */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 70 .. 77 */
+	6, 6, 6, 6, 6, 6, 6, 6,			/* 78 .. 7f */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 80 .. 87 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 88 .. 8f */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 90 .. 97 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* 98 .. 9f */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* a0 .. a7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* a8 .. af */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* b0 .. b7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* b8 .. bf */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* c0 .. c7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* c8 .. cf */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* d0 .. d7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* d8 .. df */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* e0 .. e7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* e8 .. ef */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* f0 .. f7 */
+	7, 7, 7, 7, 7, 7, 7, 7,			/* f8 .. ff */
+};
+#endif
+
+/*
+ * Count of bits set in byte, 0..8.
+ */
+static const char xfs_countbit[256] = {
+	0, 1, 1, 2, 1, 2, 2, 3,			/* 00 .. 07 */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 08 .. 0f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 10 .. 17 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 18 .. 1f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 20 .. 27 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 28 .. 2f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 30 .. 37 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 38 .. 3f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 40 .. 47 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 48 .. 4f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 50 .. 57 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 58 .. 5f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 60 .. 67 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 68 .. 6f */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 70 .. 77 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* 78 .. 7f */
+	1, 2, 2, 3, 2, 3, 3, 4,			/* 80 .. 87 */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 88 .. 8f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* 90 .. 97 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* 98 .. 9f */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* a0 .. a7 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* a8 .. af */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* b0 .. b7 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* b8 .. bf */
+	2, 3, 3, 4, 3, 4, 4, 5,			/* c0 .. c7 */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* c8 .. cf */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* d0 .. d7 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* d8 .. df */
+	3, 4, 4, 5, 4, 5, 5, 6,			/* e0 .. e7 */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* e8 .. ef */
+	4, 5, 5, 6, 5, 6, 6, 7,			/* f0 .. f7 */
+	5, 6, 6, 7, 6, 7, 7, 8,			/* f8 .. ff */
+};
+
+/*
+ * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
+ */
+int inline
+xfs_highbit32(
+	__uint32_t	v)
+{
+#ifdef HAVE_ARCH_HIGHBIT
+	return highbit32(v);
+#else
+	int		i;
+
+	if (v & 0xffff0000)
+		if (v & 0xff000000)
+			i = 24;
+		else
+			i = 16;
+	else if (v & 0x0000ffff)
+		if (v & 0x0000ff00)
+			i = 8;
+		else
+			i = 0;
+	else
+		return -1;
+	return i + xfs_highbit[(v >> i) & 0xff];
+#endif
+}
+
+/*
+ * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
+ */
+int
+xfs_lowbit64(
+	__uint64_t	v)
+{
+	int n;
+	n = ffs((unsigned)v);
+	if (n == 0) {
+		n = ffs(v >> 32);
+		if (n >= 0)
+			n+=32;
+	}
+	return n-1;
+}
+
+/*
+ * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
+ */
+int
+xfs_highbit64(
+	__uint64_t	v)
+{
+	__uint32_t h = v >> 32;
+	if (h)
+		return xfs_highbit32(h) + 32;
+	return xfs_highbit32((__u32)v);
+}
+
+
+/*
+ * Count the number of bits set in the bitmap starting with bit
+ * start_bit.  Size is the size of the bitmap in words.
+ *
+ * Do the counting by mapping a byte value to the number of set
+ * bits for that value using the xfs_countbit array, i.e.
+ * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1,
+ * xfs_countbit[3] == 2, etc.
+ */
+int
+xfs_count_bits(uint *map, uint size, uint start_bit)
+{
+	register int	bits;
+	register unsigned char	*bytep;
+	register unsigned char	*end_map;
+	int		byte_bit;
+
+	bits = 0;
+	end_map = (char*)(map + size);
+	bytep = (char*)(map + (start_bit & ~0x7));
+	byte_bit = start_bit & 0x7;
+
+	/*
+	 * If the caller fell off the end of the map, return 0.
+	 */
+	if (bytep >= end_map) {
+		return (0);
+	}
+
+	/*
+	 * If start_bit is not byte aligned, then process the
+	 * first byte separately.
+	 */
+	if (byte_bit != 0) {
+		/*
+		 * Shift off the bits we don't want to look at,
+		 * before indexing into xfs_countbit.
+		 */
+		bits += xfs_countbit[(*bytep >> byte_bit)];
+		bytep++;
+	}
+
+	/*
+	 * Count the bits in each byte until the end of the bitmap.
+	 */
+	while (bytep < end_map) {
+		bits += xfs_countbit[*bytep];
+		bytep++;
+	}
+
+	return (bits);
+}
+
+/*
+ * Count the number of contiguous bits set in the bitmap starting with bit
+ * start_bit.  Size is the size of the bitmap in words.
+ */
+int
+xfs_contig_bits(uint *map, uint size, uint start_bit)
+{
+#if BITS_PER_LONG == 32
+	return find_next_zero_bit((unsigned long *)map,
+			size * sizeof(uint) * 8, start_bit) - start_bit;
+#else
+	/*
+	 * The first argument to find_next_zero_bit needs to be aligned,
+	 * but this is coming from the xfs_buf_log_format_t on-disk
+	 * struct, which can't be padded or otherwise modified w/o breaking
+	 * on-disk compatibility... so create a temporary, aligned
+	 * variable, copy over the bitmap, and send that to find_next_zero_bit
+	 * This only happens in recovery, so it's ugly but not too bad.
+	 */
+	void * addr;
+	int bit;
+	size_t bitmap_size = size * sizeof(uint);
+
+	addr = (void *)kmem_alloc(bitmap_size, KM_SLEEP);
+	memcpy(addr, map, size * sizeof(uint));
+
+	bit = find_next_zero_bit((unsigned long *)addr,
+			size * sizeof(uint) * 8, start_bit) - start_bit;
+
+	kmem_free(addr, bitmap_size);
+
+	return bit;
+#endif
+}
+
+/*
+ * This takes the bit number to start looking from and
+ * returns the next set bit from there.	 It returns -1
+ * if there are no more bits set or the start bit is
+ * beyond the end of the bitmap.
+ *
+ * Size is the number of words, not bytes, in the bitmap.
+ */
+int xfs_next_bit(uint *map, uint size, uint start_bit)
+{
+	uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
+	uint result = start_bit & ~(NBWORD - 1);
+	uint tmp;
+
+	size <<= BIT_TO_WORD_SHIFT;
+
+	if (start_bit >= size)
+		return -1;
+	size -= result;
+	start_bit &= (NBWORD - 1);
+	if (start_bit) {
+		tmp = *p++;
+		/* set to zero first offset bits */
+		tmp &= (~0U << start_bit);
+		if (size < NBWORD)
+			goto found_first;
+		if (tmp != 0U)
+			goto found_middle;
+		size -= NBWORD;
+		result += NBWORD;
+	}
+	while (size >= NBWORD) {
+		if ((tmp = *p++) != 0U)
+			goto found_middle;
+		result += NBWORD;
+		size -= NBWORD;
+	}
+	if (!size)
+		return -1;
+	tmp = *p;
+found_first:
+found_middle:
+	return result + ffs(tmp) - 1;
+}
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
new file mode 100644
index 000000000000..ed5483fa30de
--- /dev/null
+++ b/fs/xfs/xfs_bit.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BIT_H__
+#define __XFS_BIT_H__
+
+/*
+ * XFS bit manipulation routines.
+ */
+
+/*
+ * masks with n high/low bits set, 32-bit values & 64-bit values
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32HI)
+__uint32_t xfs_mask32hi(int n);
+#define XFS_MASK32HI(n)		xfs_mask32hi(n)
+#else
+#define XFS_MASK32HI(n)		((__uint32_t)-1 << (32 - (n)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64HI)
+__uint64_t xfs_mask64hi(int n);
+#define XFS_MASK64HI(n)		xfs_mask64hi(n)
+#else
+#define XFS_MASK64HI(n)		((__uint64_t)-1 << (64 - (n)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK32LO)
+__uint32_t xfs_mask32lo(int n);
+#define XFS_MASK32LO(n)		xfs_mask32lo(n)
+#else
+#define XFS_MASK32LO(n)		(((__uint32_t)1 << (n)) - 1)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MASK64LO)
+__uint64_t xfs_mask64lo(int n);
+#define XFS_MASK64LO(n)		xfs_mask64lo(n)
+#else
+#define XFS_MASK64LO(n)		(((__uint64_t)1 << (n)) - 1)
+#endif
+
+/* Get high bit set out of 32-bit argument, -1 if none set */
+extern int xfs_highbit32(__uint32_t v);
+
+/* Get low bit set out of 64-bit argument, -1 if none set */
+extern int xfs_lowbit64(__uint64_t v);
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+extern int xfs_highbit64(__uint64_t);
+
+/* Count set bits in map starting with start_bit */
+extern int xfs_count_bits(uint *map, uint size, uint start_bit);
+
+/* Count continuous one bits in map starting with start_bit */
+extern int xfs_contig_bits(uint *map, uint size, uint start_bit);
+
+/* Find next set bit in map */
+extern int xfs_next_bit(uint *map, uint size, uint start_bit);
+
+#endif	/* __XFS_BIT_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
new file mode 100644
index 000000000000..6bf9632238e7
--- /dev/null
+++ b/fs/xfs/xfs_bmap.c
@@ -0,0 +1,6323 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+#ifdef DEBUG
+ktrace_t	*xfs_bmap_trace_buf;
+#endif
+
+#ifdef XFSDEBUG
+STATIC void
+xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork);
+#endif
+
+kmem_zone_t		*xfs_bmap_free_item_zone;
+
+/*
+ * Prototypes for internal bmap routines.
+ */
+
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_extents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags);	/* inode logging flags */
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_local(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags);	/* inode logging flags */
+
+/*
+ * Called by xfs_bmapi to update extent list structure and the btree
+ * after allocating space (or doing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	xfs_fsblock_t		*first, /* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork, /* data or attr fork */
+	int			rsvd);	/* OK to allocate reserved blocks */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a delayed
+ * allocation to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_delay_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	xfs_filblks_t		*dnew,	/* new delayed-alloc indirect blocks */
+	xfs_fsblock_t		*first, /* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	int			*logflagsp, /* inode logging flags */
+	int			rsvd);	/* OK to allocate reserved blocks */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a delayed allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_delay(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp,/* inode logging flags */
+	int			rsvd);	/* OK to allocate reserved blocks */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork); /* data or attr fork */
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting an unwritten
+ * allocation to a real allocation or vice versa.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_unwritten_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp); /* inode logging flags */
+
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
+ */
+STATIC int				/* error */
+xfs_bmap_alloc(
+	xfs_bmalloca_t		*ap);	/* bmap alloc argument struct */
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the extent list is already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int				/* error */
+xfs_bmap_btree_to_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork,  /* data or attr fork */
+	int			async);	    /* xaction can be async */
+
+#ifdef XFSDEBUG
+/*
+ * Check that the extents list for the inode ip is in the right order.
+ */
+STATIC void
+xfs_bmap_check_extents(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork);	/* data or attr fork */
+#else
+#define xfs_bmap_check_extents(ip,w)
+#endif
+
+/*
+ * Called by xfs_bmapi to update extent list structure and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_del_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_trans_t		*tp,	/* current trans pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			iflags, /* input flags (meta-data or not) */
+	int			*logflagsp,/* inode logging flags */
+	int			whichfork, /* data or attr fork */
+	int			rsvd);	 /* OK to allocate reserved blocks */
+
+/*
+ * Remove the entry "free" from the free item list.  Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+STATIC void
+xfs_bmap_del_free(
+	xfs_bmap_free_t		*flist, /* free item list header */
+	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
+	xfs_bmap_free_item_t	*free); /* list item to be freed */
+
+/*
+ * Remove count entries from the extents array for inode "ip", starting
+ * at index "idx".  Copies the remaining items down over the deleted ones,
+ * and gives back the excess memory.
+ */
+STATIC void
+xfs_bmap_delete_exlist(
+	xfs_inode_t	*ip,		/* incode inode pointer */
+	xfs_extnum_t	idx,		/* starting delete index */
+	xfs_extnum_t	count,		/* count of items to delete */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int					/* error */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Insert new item(s) in the extent list for inode "ip".
+ * Count new items are inserted at offset idx.
+ */
+STATIC void
+xfs_bmap_insert_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* starting index of new items */
+	xfs_extnum_t	count,		/* number of inserted items */
+	xfs_bmbt_irec_t *new,		/* items to insert */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Convert a local file to an extents file.
+ * This code is sort of bogus, since the file data needs to get
+ * logged so it won't be lost.	The bmap-level manipulations are ok, though.
+ */
+STATIC int				/* error */
+xfs_bmap_local_to_extents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
+	xfs_extlen_t	total,		/* total blocks needed by transaction */
+	int		*logflagsp,	/* inode logging flags */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_t *			/* pointer to found extent entry */
+xfs_bmap_search_extents(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		whichfork,	/* data or attr fork */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t *gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t *prevp);	/* out: previous extent entry found */
+
+#ifdef XFS_BMAP_TRACE
+/*
+ * Add a bmap trace buffer entry.  Base routine for the others.
+ */
+STATIC void
+xfs_bmap_trace_addentry(
+	int		opcode,		/* operation */
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(ies) */
+	xfs_extnum_t	cnt,		/* count of entries, 1 or 2 */
+	xfs_bmbt_rec_t	*r1,		/* first record */
+	xfs_bmbt_rec_t	*r2,		/* second record or null */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
+ */
+STATIC void
+xfs_bmap_trace_delete(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) deleted */
+	xfs_extnum_t	cnt,		/* count of entries deleted, 1 or 2 */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
+ * reading in the extents list from the disk (in the btree).
+ */
+STATIC void
+xfs_bmap_trace_insert(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) inserted */
+	xfs_extnum_t	cnt,		/* count of entries inserted, 1 or 2 */
+	xfs_bmbt_irec_t *r1,		/* inserted record 1 */
+	xfs_bmbt_irec_t *r2,		/* inserted record 2 or null */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Add bmap trace entry after updating an extent list entry in place.
+ */
+STATIC void
+xfs_bmap_trace_post_update(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry updated */
+	int		whichfork);	/* data or attr fork */
+
+/*
+ * Add bmap trace entry prior to updating an extent list entry in place.
+ */
+STATIC void
+xfs_bmap_trace_pre_update(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry to be updated */
+	int		whichfork);	/* data or attr fork */
+
+#else
+#define xfs_bmap_trace_delete(f,d,ip,i,c,w)
+#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w)
+#define xfs_bmap_trace_post_update(f,d,ip,i,w)
+#define xfs_bmap_trace_pre_update(f,d,ip,i,w)
+#endif	/* XFS_BMAP_TRACE */
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_filblks_t		len);	/* delayed extent length */
+
+#ifdef DEBUG
+/*
+ * Perform various validation checks on the values being returned
+ * from xfs_bmapi().
+ */
+STATIC void
+xfs_bmap_validate_ret(
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_bmbt_irec_t		*mval,
+	int			nmap,
+	int			ret_nmap);
+#else
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#endif /* DEBUG */
+
+#if defined(DEBUG) && defined(XFS_RW_TRACE)
+STATIC void
+xfs_bunmap_trace(
+	xfs_inode_t		*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	inst_t			*ra);
+#else
+#define xfs_bunmap_trace(ip, bno, len, flags, ra)
+#endif	/* DEBUG && XFS_RW_TRACE */
+
+STATIC int
+xfs_bmap_count_tree(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_fsblock_t	blockno,
+	int		levelin,
+	int		*count);
+
+STATIC int
+xfs_bmap_count_leaves(
+	xfs_bmbt_rec_t		*frp,
+	int			numrecs,
+	int			*count);
+
+/*
+ * Bmap internal routines.
+ */
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle btree format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;		/* btree cursor */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* file system mount struct */
+	int			stat;		/* newroot status */
+
+	mp = ip->i_mount;
+	if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
+		*flags |= XFS_ILOG_DBROOT;
+	else {
+		cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
+			XFS_DATA_FORK);
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.firstblock = *firstblock;
+		if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
+			goto error0;
+		ASSERT(stat == 1);	/* must be at least one entry */
+		if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
+			goto error0;
+		if (stat == 0) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+			return XFS_ERROR(ENOSPC);
+		}
+		*firstblock = cur->bc_private.b.firstblock;
+		cur->bc_private.b.allocated = 0;
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	}
+	return 0;
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle extents format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_extents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	int			error;		/* error return value */
+
+	if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip))
+		return 0;
+	cur = NULL;
+	error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0,
+		flags, XFS_DATA_FORK);
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
+/*
+ * Called from xfs_bmap_add_attrfork to handle local format files.
+ */
+STATIC int					/* error */
+xfs_bmap_add_attrfork_local(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first block allocated */
+	xfs_bmap_free_t		*flist,		/* blocks to free at commit */
+	int			*flags)		/* inode logging flags */
+{
+	xfs_da_args_t		dargs;		/* args for dir/attr code */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* mount structure pointer */
+
+	if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip))
+		return 0;
+	if ((ip->i_d.di_mode & IFMT) == IFDIR) {
+		mp = ip->i_mount;
+		bzero(&dargs, sizeof(dargs));
+		dargs.dp = ip;
+		dargs.firstblock = firstblock;
+		dargs.flist = flist;
+		dargs.total = mp->m_dirblkfsbs;
+		dargs.whichfork = XFS_DATA_FORK;
+		dargs.trans = tp;
+		error = XFS_DIR_SHORTFORM_TO_SINGLE(mp, &dargs);
+	} else
+		error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags,
+			XFS_DATA_FORK);
+	return error;
+}
+
+/*
+ * Called by xfs_bmapi to update extent list structure and the btree
+ * after allocating space (or doing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	xfs_fsblock_t		*first, /* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork, /* data or attr fork */
+	int			rsvd)	/* OK to use reserved data blocks */
+{
+	xfs_btree_cur_t		*cur;	/* btree cursor or null */
+	xfs_filblks_t		da_new; /* new count del alloc blocks used */
+	xfs_filblks_t		da_old; /* old count del alloc blocks used */
+	int			error;	/* error return value */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent";
+#endif
+	xfs_ifork_t		*ifp;	/* inode fork ptr */
+	int			logflags; /* returned value */
+	xfs_extnum_t		nextents; /* number of extents in file now */
+
+	XFS_STATS_INC(xfsstats.xs_add_exlist);
+	cur = *curp;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(idx <= nextents);
+	da_old = da_new = 0;
+	error = 0;
+	/*
+	 * This is the first extent added to a new/empty file.
+	 * Special case this one, so other routines get to assume there are
+	 * already extents in the list.
+	 */
+	if (nextents == 0) {
+		xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new,
+			NULL, whichfork);
+		xfs_bmap_insert_exlist(ip, 0, 1, new, whichfork);
+		ASSERT(cur == NULL);
+		ifp->if_lastex = 0;
+		if (!ISNULLSTARTBLOCK(new->br_startblock)) {
+			XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+			logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+		} else
+			logflags = 0;
+	}
+	/*
+	 * Any kind of new delayed allocation goes here.
+	 */
+	else if (ISNULLSTARTBLOCK(new->br_startblock)) {
+		if (cur)
+			ASSERT((cur->bc_private.b.flags &
+				XFS_BTCUR_BPRV_WASDEL) == 0);
+		if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new,
+				&logflags, rsvd)))
+			goto done;
+	}
+	/*
+	 * Real allocation off the end of the file.
+	 */
+	else if (idx == nextents) {
+		if (cur)
+			ASSERT((cur->bc_private.b.flags &
+				XFS_BTCUR_BPRV_WASDEL) == 0);
+		if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
+				&logflags, whichfork)))
+			goto done;
+	} else {
+		xfs_bmbt_irec_t prev;	/* old extent at offset idx */
+
+		/*
+		 * Get the record referred to by idx.
+		 */
+		xfs_bmbt_get_all(&ifp->if_u1.if_extents[idx], &prev);
+		/*
+		 * If it's a real allocation record, and the new allocation ends
+		 * after the start of the referred to record, then we're filling
+		 * in a delayed or unwritten allocation with a real one, or
+		 * converting real back to unwritten.
+		 */
+		if (!ISNULLSTARTBLOCK(new->br_startblock) &&
+		    new->br_startoff + new->br_blockcount > prev.br_startoff) {
+			if (prev.br_state != XFS_EXT_UNWRITTEN &&
+			    ISNULLSTARTBLOCK(prev.br_startblock)) {
+				da_old = STARTBLOCKVAL(prev.br_startblock);
+				if (cur)
+					ASSERT(cur->bc_private.b.flags &
+						XFS_BTCUR_BPRV_WASDEL);
+				if ((error = xfs_bmap_add_extent_delay_real(ip,
+					idx, &cur, new, &da_new, first, flist,
+					&logflags, rsvd)))
+					goto done;
+			} else if (new->br_state == XFS_EXT_NORM) {
+				ASSERT(new->br_state == XFS_EXT_NORM);
+				if ((error = xfs_bmap_add_extent_unwritten_real(
+					ip, idx, &cur, new, &logflags)))
+					goto done;
+			} else {
+				ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
+				if ((error = xfs_bmap_add_extent_unwritten_real(
+					ip, idx, &cur, new, &logflags)))
+					goto done;
+			}
+			ASSERT(*curp == cur || *curp == NULL);
+		}
+		/*
+		 * Otherwise we're filling in a hole with an allocation.
+		 */
+		else {
+			if (cur)
+				ASSERT((cur->bc_private.b.flags &
+					XFS_BTCUR_BPRV_WASDEL) == 0);
+			if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
+					new, &logflags, whichfork)))
+				goto done;
+		}
+	}
+
+	ASSERT(*curp == cur || *curp == NULL);
+	/*
+	 * Convert to a btree if necessary.
+	 */
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+		int	tmp_logflags;	/* partial log flag return val */
+
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first,
+			flist, &cur, da_old > 0, &tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto done;
+	}
+	/*
+	 * Adjust for changes in reserved delayed indirect blocks.
+	 * Nothing to do for disk quotas here.
+	 */
+	if (da_old || da_new) {
+		xfs_filblks_t	nblks;
+
+		nblks = da_new;
+		if (cur)
+			nblks += cur->bc_private.b.allocated;
+		ASSERT(nblks <= da_old);
+		if (nblks < da_old)
+			xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+				(int)(da_old - nblks), rsvd);
+	}
+	/*
+	 * Clear out the allocated field, done with it now in any case.
+	 */
+	if (cur) {
+		cur->bc_private.b.allocated = 0;
+		*curp = cur;
+	}
+done:
+#ifdef XFSDEBUG
+	if (!error)
+		xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
+#endif
+	*logflagsp = logflags;
+	return error;
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a delayed
+ * allocation to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_delay_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	xfs_filblks_t		*dnew,	/* new delayed-alloc indirect blocks */
+	xfs_fsblock_t		*first, /* pointer to firstblock variable */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	int			*logflagsp, /* inode logging flags */
+	int			rsvd)	/* OK to use reserved data block allocation */
+{
+	xfs_bmbt_rec_t		*base;	/* base of extent entry list */
+	xfs_btree_cur_t		*cur;	/* btree cursor */
+	int			diff;	/* temp value */
+	xfs_bmbt_rec_t		*ep;	/* extent entry for idx */
+	int			error;	/* error return value */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent_delay_real";
+#endif
+	int			i;	/* temp state */
+	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
+	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+	int			rval=0; /* return value (logging flags) */
+	int			state = 0;/* state bits, accessed thru macros */
+	xfs_filblks_t		temp;	/* value for dnew calculations */
+	xfs_filblks_t		temp2;	/* value for dnew calculations */
+	int			tmp_rval;	/* partial logging flags */
+	enum {				/* bit number definitions for state */
+		LEFT_CONTIG,	RIGHT_CONTIG,
+		LEFT_FILLING,	RIGHT_FILLING,
+		LEFT_DELAY,	RIGHT_DELAY,
+		LEFT_VALID,	RIGHT_VALID
+	};
+
+#define LEFT		r[0]
+#define RIGHT		r[1]
+#define PREV		r[2]
+#define MASK(b)		(1 << (b))
+#define MASK2(a,b)	(MASK(a) | MASK(b))
+#define MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
+#define MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
+#define STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
+#define STATE_TEST(b)	(state & MASK(b))
+#define STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
+				       ((state &= ~MASK(b)), 0))
+#define SWITCH_STATE		\
+	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
+
+	/*
+	 * Set up a bunch of variables to make the tests simpler.
+	 */
+	cur = *curp;
+	base = ip->i_df.if_u1.if_extents;
+	ep = &base[idx];
+	xfs_bmbt_get_all(ep, &PREV);
+	new_endoff = new->br_startoff + new->br_blockcount;
+	ASSERT(PREV.br_startoff <= new->br_startoff);
+	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+	/*
+	 * Set flags determining what part of the previous delayed allocation
+	 * extent is being replaced by a real allocation.
+	 */
+	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
+	STATE_SET(RIGHT_FILLING,
+		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 */
+	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+		xfs_bmbt_get_all(ep - 1, &LEFT);
+		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+	}
+	STATE_SET(LEFT_CONTIG,
+		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
+		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+		LEFT.br_state == new->br_state &&
+		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+	/*
+	 * Check and set flags if this segment has a right neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 * Also check for all-three-contiguous being too large.
+	 */
+	if (STATE_SET_TEST(RIGHT_VALID,
+			idx <
+			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+		xfs_bmbt_get_all(ep + 1, &RIGHT);
+		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+	}
+	STATE_SET(RIGHT_CONTIG,
+		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
+		new_endoff == RIGHT.br_startoff &&
+		new->br_startblock + new->br_blockcount ==
+		    RIGHT.br_startblock &&
+		new->br_state == RIGHT.br_state &&
+		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
+		  MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
+		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+		     <= MAXEXTLEN));
+	error = 0;
+	/*
+	 * Switch out based on the FILLING and CONTIG state bits.
+	 */
+	switch (SWITCH_STATE) {
+
+	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + PREV.br_blockcount +
+			RIGHT.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, LEFT.br_state)))
+				goto done;
+		}
+		*dnew = 0;
+		break;
+
+	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + PREV.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					PREV.br_blockcount, LEFT.br_state)))
+				goto done;
+		}
+		*dnew = 0;
+		break;
+
+	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount + RIGHT.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+					new->br_startblock,
+					PREV.br_blockcount +
+					RIGHT.br_blockcount, PREV.br_state)))
+				goto done;
+		}
+		*dnew = 0;
+		break;
+
+	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+		/*
+		 * Filling in all of a previously delayed allocation extent.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		*dnew = 0;
+		break;
+
+	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+		/*
+		 * Filling in the first part of a previous delayed allocation.
+		 * The left neighbor is contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + new->br_blockcount);
+		xfs_bmbt_set_startoff(ep,
+			PREV.br_startoff + new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep, temp);
+		ip->i_df.if_lastex = idx - 1;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff,
+					LEFT.br_startblock, LEFT.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+					LEFT.br_startblock,
+					LEFT.br_blockcount +
+					new->br_blockcount,
+					LEFT.br_state)))
+				goto done;
+		}
+		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+			STARTBLOCKVAL(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
+			XFS_DATA_FORK);
+		*dnew = temp;
+		break;
+
+	case MASK(LEFT_FILLING):
+		/*
+		 * Filling in the first part of a previous delayed allocation.
+		 * The left neighbor is not contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_startoff(ep, new_endoff);
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
+					first, flist, &cur, 1, &tmp_rval,
+					XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+			STARTBLOCKVAL(PREV.br_startblock) -
+			(cur ? cur->bc_private.b.allocated : 0));
+		base = ip->i_df.if_u1.if_extents;
+		ep = &base[idx + 1];
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1,
+			XFS_DATA_FORK);
+		*dnew = temp;
+		break;
+
+	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+		/*
+		 * Filling in the last part of a previous delayed allocation.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
+			new->br_blockcount + RIGHT.br_blockcount,
+			RIGHT.br_state);
+		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+					new->br_startblock,
+					new->br_blockcount +
+					RIGHT.br_blockcount,
+					RIGHT.br_state)))
+				goto done;
+		}
+		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+			STARTBLOCKVAL(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		*dnew = temp;
+		break;
+
+	case MASK(RIGHT_FILLING):
+		/*
+		 * Filling in the last part of a previous delayed allocation.
+		 * The right neighbor is not contiguous.
+		 */
+		temp = PREV.br_blockcount - new->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep, temp);
+		xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
+			new, NULL, XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
+				first, flist, &cur, 1, &tmp_rval,
+				XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+			STARTBLOCKVAL(PREV.br_startblock) -
+			(cur ? cur->bc_private.b.allocated : 0));
+		base = ip->i_df.if_u1.if_extents;
+		ep = &base[idx];
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+		*dnew = temp;
+		break;
+
+	case 0:
+		/*
+		 * Filling in the middle part of a previous delayed allocation.
+		 * Contiguity is impossible here.
+		 * This case is avoided almost all the time.
+		 */
+		temp = new->br_startoff - PREV.br_startoff;
+		xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep, temp);
+		r[0] = *new;
+		r[1].br_startoff = new_endoff;
+		temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
+		r[1].br_blockcount = temp2;
+		xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
+		    ip->i_d.di_nextents > ip->i_df.if_ext_max) {
+			error = xfs_bmap_extents_to_btree(ip->i_transp, ip,
+					first, flist, &cur, 1, &tmp_rval,
+					XFS_DATA_FORK);
+			rval |= tmp_rval;
+			if (error)
+				goto done;
+		}
+		temp = xfs_bmap_worst_indlen(ip, temp);
+		temp2 = xfs_bmap_worst_indlen(ip, temp2);
+		diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) -
+			(cur ? cur->bc_private.b.allocated : 0));
+		if (diff > 0 &&
+		    xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) {
+			/*
+			 * Ick gross gag me with a spoon.
+			 */
+			ASSERT(0);	/* want to see if this ever happens! */
+			while (diff > 0) {
+				if (temp) {
+					temp--;
+					diff--;
+					if (!diff ||
+					    !xfs_mod_incore_sb(ip->i_mount,
+						    XFS_SBS_FDBLOCKS, -diff, rsvd))
+						break;
+				}
+				if (temp2) {
+					temp2--;
+					diff--;
+					if (!diff ||
+					    !xfs_mod_incore_sb(ip->i_mount,
+						    XFS_SBS_FDBLOCKS, -diff, rsvd))
+						break;
+				}
+			}
+		}
+		base = ip->i_df.if_u1.if_extents;
+		ep = &base[idx];
+		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
+		xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_startblock(ep + 2, NULLSTARTBLOCK((int)temp2));
+		xfs_bmap_trace_post_update(fname, "0", ip, idx + 2,
+			XFS_DATA_FORK);
+		*dnew = temp + temp2;
+		break;
+
+	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
+	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
+	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK(LEFT_CONTIG):
+	case MASK(RIGHT_CONTIG):
+		/*
+		 * These cases are all impossible.
+		 */
+		ASSERT(0);
+	}
+	*curp = cur;
+done:
+	*logflagsp = rval;
+	return error;
+#undef	LEFT
+#undef	RIGHT
+#undef	PREV
+#undef	MASK
+#undef	MASK2
+#undef	MASK3
+#undef	MASK4
+#undef	STATE_SET
+#undef	STATE_TEST
+#undef	STATE_SET_TEST
+#undef	SWITCH_STATE
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting an unwritten
+ * allocation to a real allocation or vice versa.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_unwritten_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		**curp, /* if *curp is null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp) /* inode logging flags */
+{
+	xfs_bmbt_rec_t		*base;	/* base of extent entry list */
+	xfs_btree_cur_t		*cur;	/* btree cursor */
+	xfs_bmbt_rec_t		*ep;	/* extent entry for idx */
+	int			error;	/* error return value */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent_unwritten_real";
+#endif
+	int			i;	/* temp state */
+	xfs_fileoff_t		new_endoff;	/* end offset of new entry */
+	xfs_exntst_t		newext; /* new extent state */
+	xfs_exntst_t		oldext; /* old extent state */
+	xfs_bmbt_irec_t		r[3];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+	int			rval=0; /* return value (logging flags) */
+	int			state = 0;/* state bits, accessed thru macros */
+	enum {				/* bit number definitions for state */
+		LEFT_CONTIG,	RIGHT_CONTIG,
+		LEFT_FILLING,	RIGHT_FILLING,
+		LEFT_DELAY,	RIGHT_DELAY,
+		LEFT_VALID,	RIGHT_VALID
+	};
+
+#define LEFT		r[0]
+#define RIGHT		r[1]
+#define PREV		r[2]
+#define MASK(b)		(1 << (b))
+#define MASK2(a,b)	(MASK(a) | MASK(b))
+#define MASK3(a,b,c)	(MASK2(a,b) | MASK(c))
+#define MASK4(a,b,c,d)	(MASK3(a,b,c) | MASK(d))
+#define STATE_SET(b,v)	((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
+#define STATE_TEST(b)	(state & MASK(b))
+#define STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
+				       ((state &= ~MASK(b)), 0))
+#define SWITCH_STATE		\
+	(state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG))
+
+	/*
+	 * Set up a bunch of variables to make the tests simpler.
+	 */
+	error = 0;
+	cur = *curp;
+	base = ip->i_df.if_u1.if_extents;
+	ep = &base[idx];
+	xfs_bmbt_get_all(ep, &PREV);
+	newext = new->br_state;
+	oldext = (newext == XFS_EXT_UNWRITTEN) ?
+		XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+	ASSERT(PREV.br_state == oldext);
+	new_endoff = new->br_startoff + new->br_blockcount;
+	ASSERT(PREV.br_startoff <= new->br_startoff);
+	ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff);
+	/*
+	 * Set flags determining what part of the previous oldext allocation
+	 * extent is being replaced by a newext allocation.
+	 */
+	STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff);
+	STATE_SET(RIGHT_FILLING,
+		PREV.br_startoff + PREV.br_blockcount == new_endoff);
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 */
+	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+		xfs_bmbt_get_all(ep - 1, &LEFT);
+		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+	}
+	STATE_SET(LEFT_CONTIG,
+		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
+		LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
+		LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
+		LEFT.br_state == newext &&
+		LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+	/*
+	 * Check and set flags if this segment has a right neighbor.
+	 * Don't set contiguous if the combined extent would be too large.
+	 * Also check for all-three-contiguous being too large.
+	 */
+	if (STATE_SET_TEST(RIGHT_VALID,
+			idx <
+			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
+		xfs_bmbt_get_all(ep + 1, &RIGHT);
+		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+	}
+	STATE_SET(RIGHT_CONTIG,
+		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
+		new_endoff == RIGHT.br_startoff &&
+		new->br_startblock + new->br_blockcount ==
+		    RIGHT.br_startblock &&
+		newext == RIGHT.br_state &&
+		new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+		((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) !=
+		  MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) ||
+		 LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
+		     <= MAXEXTLEN));
+	/*
+	 * Switch out based on the FILLING and CONTIG state bits.
+	 */
+	switch (SWITCH_STATE) {
+
+	case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + PREV.br_blockcount +
+			RIGHT.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 2, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		ip->i_d.di_nextents -= 2;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + PREV.br_blockcount +
+				RIGHT.br_blockcount, LEFT.br_state)))
+				goto done;
+		}
+		break;
+
+	case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG):
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + PREV.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + PREV.br_blockcount,
+				LEFT.br_state)))
+				goto done;
+		}
+		break;
+
+	case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG):
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount + RIGHT.br_blockcount);
+		xfs_bmbt_set_state(ep, newext);
+		xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx + 1, 1, XFS_DATA_FORK);
+		ip->i_d.di_nextents--;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff,
+					RIGHT.br_startblock,
+					RIGHT.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_delete(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + RIGHT.br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case MASK2(LEFT_FILLING, RIGHT_FILLING):
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_state(ep, newext);
+		xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock, new->br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case MASK2(LEFT_FILLING, LEFT_CONTIG):
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1,
+			LEFT.br_blockcount + new->br_blockcount);
+		xfs_bmbt_set_startoff(ep,
+			PREV.br_startoff + new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_startblock(ep,
+			new->br_startblock + new->br_blockcount);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur,
+				PREV.br_startoff + new->br_blockcount,
+				PREV.br_startblock + new->br_blockcount,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+				goto done;
+			if (xfs_bmbt_update(cur, LEFT.br_startoff,
+				LEFT.br_startblock,
+				LEFT.br_blockcount + new->br_blockcount,
+				LEFT.br_state))
+				goto done;
+		}
+		break;
+
+	case MASK(LEFT_FILLING):
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is not contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK);
+		ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
+		xfs_bmbt_set_startoff(ep, new_endoff);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmbt_set_startblock(ep,
+			new->br_startblock + new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK);
+		xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL,
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur,
+				PREV.br_startoff + new->br_blockcount,
+				PREV.br_startblock + new->br_blockcount,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			cur->bc_rec.b = *new;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		break;
+
+	case MASK2(RIGHT_FILLING, RIGHT_CONTIG):
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_allf(ep + 1, new->br_startoff, new->br_startblock,
+			new->br_blockcount + RIGHT.br_blockcount, newext);
+		xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		if (cur == NULL)
+			rval = XFS_ILOG_DEXT;
+		else {
+			rval = 0;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock,
+					PREV.br_blockcount, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+				PREV.br_startblock,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_bmbt_increment(cur, 0, &i)))
+				goto done;
+			if ((error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + RIGHT.br_blockcount,
+				newext)))
+				goto done;
+		}
+		break;
+
+	case MASK(RIGHT_FILLING):
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is not contiguous.
+		 */
+		xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep,
+			PREV.br_blockcount - new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK);
+		xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1,
+			new, NULL, XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx + 1, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		ip->i_d.di_nextents++;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
+				PREV.br_startblock,
+				PREV.br_blockcount - new->br_blockcount,
+				oldext)))
+				goto done;
+			if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+					new->br_startblock, new->br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 0);
+			cur->bc_rec.b.br_state = XFS_EXT_NORM;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		break;
+
+	case 0:
+		/*
+		 * Setting the middle part of a previous oldext extent to
+		 * newext.  Contiguity is impossible here.
+		 * One extent becomes three extents.
+		 */
+		xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep,
+			new->br_startoff - PREV.br_startoff);
+		xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK);
+		r[0] = *new;
+		r[1].br_startoff = new_endoff;
+		r[1].br_blockcount =
+			PREV.br_startoff + PREV.br_blockcount - new_endoff;
+		r[1].br_startblock = new->br_startblock + new->br_blockcount;
+		r[1].br_state = oldext;
+		xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1],
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx + 1, 2, &r[0], XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx + 1;
+		ip->i_d.di_nextents += 2;
+		if (cur == NULL)
+			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+		else {
+			rval = XFS_ILOG_CORE;
+			if ((error = xfs_bmbt_lookup_eq(cur, PREV.br_startoff,
+					PREV.br_startblock, PREV.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+			/* new right extent - oldext */
+			if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
+				r[1].br_startblock, r[1].br_blockcount,
+				r[1].br_state)))
+				goto done;
+			/* new left extent - oldext */
+			PREV.br_blockcount =
+				new->br_startoff - PREV.br_startoff;
+			cur->bc_rec.b = PREV;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+			if ((error = xfs_bmbt_increment(cur, 0, &i)))
+				goto done;
+			ASSERT(i == 1);
+			/* new middle extent - newext */
+			cur->bc_rec.b = *new;
+			if ((error = xfs_bmbt_insert(cur, &i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		break;
+
+	case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK2(LEFT_FILLING, RIGHT_CONTIG):
+	case MASK2(RIGHT_FILLING, LEFT_CONTIG):
+	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+	case MASK(LEFT_CONTIG):
+	case MASK(RIGHT_CONTIG):
+		/*
+		 * These cases are all impossible.
+		 */
+		ASSERT(0);
+	}
+	*curp = cur;
+done:
+	*logflagsp = rval;
+	return error;
+#undef	LEFT
+#undef	RIGHT
+#undef	PREV
+#undef	MASK
+#undef	MASK2
+#undef	MASK3
+#undef	MASK4
+#undef	STATE_SET
+#undef	STATE_TEST
+#undef	STATE_SET_TEST
+#undef	SWITCH_STATE
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a delayed allocation.
+ */
+/*ARGSUSED*/
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_delay(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp, /* inode logging flags */
+	int			rsvd)		/* OK to allocate reserved blocks */
+{
+	xfs_bmbt_rec_t		*base;	/* base of extent entry list */
+	xfs_bmbt_rec_t		*ep;	/* extent list entry for idx */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent_hole_delay";
+#endif
+	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
+	xfs_filblks_t		newlen=0;	/* new indirect size */
+	xfs_filblks_t		oldlen=0;	/* old indirect size */
+	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			state;	/* state bits, accessed thru macros */
+	xfs_filblks_t		temp;	/* temp for indirect calculations */
+	enum {				/* bit number definitions for state */
+		LEFT_CONTIG,	RIGHT_CONTIG,
+		LEFT_DELAY,	RIGHT_DELAY,
+		LEFT_VALID,	RIGHT_VALID
+	};
+
+#define MASK(b)			(1 << (b))
+#define MASK2(a,b)		(MASK(a) | MASK(b))
+#define STATE_SET(b,v)		((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
+#define STATE_TEST(b)		(state & MASK(b))
+#define STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
+				       ((state &= ~MASK(b)), 0))
+#define SWITCH_STATE		(state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
+
+	base = ip->i_df.if_u1.if_extents;
+	ep = &base[idx];
+	state = 0;
+	ASSERT(ISNULLSTARTBLOCK(new->br_startblock));
+	/*
+	 * Check and set flags if this segment has a left neighbor
+	 */
+	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+		xfs_bmbt_get_all(ep - 1, &left);
+		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+	}
+	/*
+	 * Check and set flags if the current (right) segment exists.
+	 * If it doesn't exist, we're converting the hole at end-of-file.
+	 */
+	if (STATE_SET_TEST(RIGHT_VALID,
+			   idx <
+			   ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+		xfs_bmbt_get_all(ep, &right);
+		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+	}
+	/*
+	 * Set contiguity flags on the left and right neighbors.
+	 * Don't let extents get too large, even if the pieces are contiguous.
+	 */
+	STATE_SET(LEFT_CONTIG,
+		STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) &&
+		left.br_startoff + left.br_blockcount == new->br_startoff &&
+		left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+	STATE_SET(RIGHT_CONTIG,
+		STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) &&
+		new->br_startoff + new->br_blockcount == right.br_startoff &&
+		new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+		(!STATE_TEST(LEFT_CONTIG) ||
+		 (left.br_blockcount + new->br_blockcount +
+		     right.br_blockcount <= MAXEXTLEN)));
+	/*
+	 * Switch out based on the contiguity flags.
+	 */
+	switch (SWITCH_STATE) {
+
+	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+		/*
+		 * New allocation is contiguous with delayed allocations
+		 * on the left and on the right.
+		 * Merge all three into a single extent list entry.
+		 */
+		temp = left.br_blockcount + new->br_blockcount +
+			right.br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1, temp);
+		oldlen = STARTBLOCKVAL(left.br_startblock) +
+			STARTBLOCKVAL(new->br_startblock) +
+			STARTBLOCKVAL(right.br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
+		xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1,
+			XFS_DATA_FORK);
+		xfs_bmap_delete_exlist(ip, idx, 1, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		break;
+
+	case MASK(LEFT_CONTIG):
+		/*
+		 * New allocation is contiguous with a delayed allocation
+		 * on the left.
+		 * Merge the new allocation with the left neighbor.
+		 */
+		temp = left.br_blockcount + new->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		xfs_bmbt_set_blockcount(ep - 1, temp);
+		oldlen = STARTBLOCKVAL(left.br_startblock) +
+			STARTBLOCKVAL(new->br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_startblock(ep - 1, NULLSTARTBLOCK((int)newlen));
+		xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1,
+			XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx - 1;
+		break;
+
+	case MASK(RIGHT_CONTIG):
+		/*
+		 * New allocation is contiguous with a delayed allocation
+		 * on the right.
+		 * Merge the new allocation with the right neighbor.
+		 */
+		xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK);
+		temp = new->br_blockcount + right.br_blockcount;
+		oldlen = STARTBLOCKVAL(new->br_startblock) +
+			STARTBLOCKVAL(right.br_startblock);
+		newlen = xfs_bmap_worst_indlen(ip, temp);
+		xfs_bmbt_set_allf(ep, new->br_startoff,
+			NULLSTARTBLOCK((int)newlen), temp, right.br_state);
+		xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		break;
+
+	case 0:
+		/*
+		 * New allocation is not contiguous with another
+		 * delayed allocation.
+		 * Insert a new entry.
+		 */
+		oldlen = newlen = 0;
+		xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
+			XFS_DATA_FORK);
+		xfs_bmap_insert_exlist(ip, idx, 1, new, XFS_DATA_FORK);
+		ip->i_df.if_lastex = idx;
+		break;
+	}
+	if (oldlen != newlen) {
+		ASSERT(oldlen > newlen);
+		xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS,
+			(int)(oldlen - newlen), rsvd);
+		/*
+		 * Nothing to do for disk quota accounting here.
+		 */
+	}
+	*logflagsp = 0;
+	return 0;
+#undef	MASK
+#undef	MASK2
+#undef	STATE_SET
+#undef	STATE_TEST
+#undef	STATE_SET_TEST
+#undef	SWITCH_STATE
+}
+
+/*
+ * Called by xfs_bmap_add_extent to handle cases converting a hole
+ * to a real allocation.
+ */
+STATIC int				/* error */
+xfs_bmap_add_extent_hole_real(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_extnum_t		idx,	/* extent number to update/insert */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*new,	/* new data to put in extent list */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork) /* data or attr fork */
+{
+	xfs_bmbt_rec_t		*ep;	/* pointer to extent entry ins. point */
+	int			error;	/* error return value */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_add_extent_hole_real";
+#endif
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
+	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
+	int			state;	/* state bits, accessed thru macros */
+	enum {				/* bit number definitions for state */
+		LEFT_CONTIG,	RIGHT_CONTIG,
+		LEFT_DELAY,	RIGHT_DELAY,
+		LEFT_VALID,	RIGHT_VALID
+	};
+
+#define MASK(b)			(1 << (b))
+#define MASK2(a,b)		(MASK(a) | MASK(b))
+#define STATE_SET(b,v)		((v) ? (state |= MASK(b)) : (state &= ~MASK(b)))
+#define STATE_TEST(b)		(state & MASK(b))
+#define STATE_SET_TEST(b,v)	((v) ? ((state |= MASK(b)), 1) : \
+				       ((state &= ~MASK(b)), 0))
+#define SWITCH_STATE		(state & MASK2(LEFT_CONTIG, RIGHT_CONTIG))
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
+	ep = &ifp->if_u1.if_extents[idx];
+	state = 0;
+	/*
+	 * Check and set flags if this segment has a left neighbor.
+	 */
+	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
+		xfs_bmbt_get_all(ep - 1, &left);
+		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+	}
+	/*
+	 * Check and set flags if this segment has a current value.
+	 * Not true if we're inserting into the "hole" at eof.
+	 */
+	if (STATE_SET_TEST(RIGHT_VALID,
+			   idx <
+			   ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
+		xfs_bmbt_get_all(ep, &right);
+		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+	}
+	/*
+	 * We're inserting a real allocation between "left" and "right".
+	 * Set the contiguity flags.  Don't let extents get too large.
+	 */
+	STATE_SET(LEFT_CONTIG,
+		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
+		left.br_startoff + left.br_blockcount == new->br_startoff &&
+		left.br_startblock + left.br_blockcount == new->br_startblock &&
+		left.br_state == new->br_state &&
+		left.br_blockcount + new->br_blockcount <= MAXEXTLEN);
+	STATE_SET(RIGHT_CONTIG,
+		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
+		new->br_startoff + new->br_blockcount == right.br_startoff &&
+		new->br_startblock + new->br_blockcount ==
+		    right.br_startblock &&
+		new->br_state == right.br_state &&
+		new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+		(!STATE_TEST(LEFT_CONTIG) ||
+		 left.br_blockcount + new->br_blockcount +
+		     right.br_blockcount <= MAXEXTLEN));
+
+	/*
+	 * Select which case we're in here, and implement it.
+	 */
+	switch (SWITCH_STATE) {
+
+	case MASK2(LEFT_CONTIG, RIGHT_CONTIG):
+		/*
+		 * New allocation is contiguous with real allocations on the
+		 * left and on the right.
+		 * Merge all three into a single extent list entry.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1,
+			whichfork);
+		xfs_bmbt_set_blockcount(ep - 1,
+			left.br_blockcount + new->br_blockcount +
+			right.br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1,
+			whichfork);
+		xfs_bmap_trace_delete(fname, "LC|RC", ip,
+			idx, 1, whichfork);
+		xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
+		ifp->if_lastex = idx - 1;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+		if (cur == NULL) {
+			*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			return 0;
+		}
+		*logflagsp = XFS_ILOG_CORE;
+		if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
+				right.br_startblock, right.br_blockcount, &i)))
+			return error;
+		ASSERT(i == 1);
+		if ((error = xfs_bmbt_delete(cur, 0, &i)))
+			return error;
+		ASSERT(i == 1);
+		if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			return error;
+		ASSERT(i == 1);
+		error = xfs_bmbt_update(cur, left.br_startoff,
+				left.br_startblock,
+				left.br_blockcount + new->br_blockcount +
+				right.br_blockcount, left.br_state);
+		return error;
+
+	case MASK(LEFT_CONTIG):
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the left.
+		 * Merge the new allocation with the left neighbor.
+		 */
+		xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork);
+		xfs_bmbt_set_blockcount(ep - 1,
+			left.br_blockcount + new->br_blockcount);
+		xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork);
+		ifp->if_lastex = idx - 1;
+		if (cur == NULL) {
+			*logflagsp = XFS_ILOG_FEXT(whichfork);
+			return 0;
+		}
+		*logflagsp = 0;
+		if ((error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
+				left.br_startblock, left.br_blockcount, &i)))
+			return error;
+		ASSERT(i == 1);
+		error = xfs_bmbt_update(cur, left.br_startoff,
+				left.br_startblock,
+				left.br_blockcount + new->br_blockcount,
+				left.br_state);
+		return error;
+
+	case MASK(RIGHT_CONTIG):
+		/*
+		 * New allocation is contiguous with a real allocation
+		 * on the right.
+		 * Merge the new allocation with the right neighbor.
+		 */
+		xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork);
+		xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
+			new->br_blockcount + right.br_blockcount,
+			right.br_state);
+		xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork);
+		ifp->if_lastex = idx;
+		if (cur == NULL) {
+			*logflagsp = XFS_ILOG_FEXT(whichfork);
+			return 0;
+		}
+		*logflagsp = 0;
+		if ((error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
+				right.br_startblock, right.br_blockcount, &i)))
+			return error;
+		ASSERT(i == 1);
+		error = xfs_bmbt_update(cur, new->br_startoff,
+				new->br_startblock,
+				new->br_blockcount + right.br_blockcount,
+				right.br_state);
+		return error;
+
+	case 0:
+		/*
+		 * New allocation is not contiguous with another
+		 * real allocation.
+		 * Insert a new entry.
+		 */
+		xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL,
+			whichfork);
+		xfs_bmap_insert_exlist(ip, idx, 1, new, whichfork);
+		ifp->if_lastex = idx;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+		if (cur == NULL) {
+			*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			return 0;
+		}
+		*logflagsp = XFS_ILOG_CORE;
+		if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+				new->br_startblock, new->br_blockcount, &i)))
+			return error;
+		ASSERT(i == 0);
+		cur->bc_rec.b.br_state = new->br_state;
+		if ((error = xfs_bmbt_insert(cur, &i)))
+			return error;
+		ASSERT(i == 1);
+		return 0;
+	}
+#undef	MASK
+#undef	MASK2
+#undef	STATE_SET
+#undef	STATE_TEST
+#undef	STATE_SET_TEST
+#undef	SWITCH_STATE
+	/* NOTREACHED */
+	ASSERT(0);
+	return 0; /* keep gcc quite */
+}
+
+#define XFS_ALLOC_GAP_UNITS	4
+
+/*
+ * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
+ * It figures out where to ask the underlying allocator to put the new extent.
+ */
+STATIC int				/* error */
+xfs_bmap_alloc(
+	xfs_bmalloca_t	*ap)		/* bmap alloc argument struct */
+{
+	xfs_fsblock_t	adjust;		/* adjustment to block numbers */
+	xfs_alloctype_t atype=0;	/* type for allocation routines */
+	int		error;		/* error return value */
+	xfs_agnumber_t	fb_agno;	/* ag number of ap->firstblock */
+	xfs_mount_t	*mp;		/* mount point structure */
+	int		nullfb;		/* true if ap->firstblock isn't set */
+	int		rt;		/* true if inode is realtime */
+#ifdef __KERNEL__
+	xfs_extlen_t	prod=0;		/* product factor for allocators */
+	xfs_extlen_t	ralen=0;	/* realtime allocation length */
+#endif
+
+#define ISLEGAL(x,y)	\
+	(rt ? \
+		(x) < mp->m_sb.sb_rblocks : \
+		XFS_FSB_TO_AGNO(mp, x) == XFS_FSB_TO_AGNO(mp, y) && \
+		XFS_FSB_TO_AGNO(mp, x) < mp->m_sb.sb_agcount && \
+		XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks)
+
+	/*
+	 * Set up variables.
+	 */
+	mp = ap->ip->i_mount;
+	nullfb = ap->firstblock == NULLFSBLOCK;
+	rt = (ap->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && ap->userdata;
+	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock);
+#ifdef __KERNEL__
+	if (rt) {
+		xfs_extlen_t	extsz;		/* file extent size for rt */
+		xfs_fileoff_t	nexto;		/* next file offset */
+		xfs_extlen_t	orig_alen;	/* original ap->alen */
+		xfs_fileoff_t	orig_end;	/* original off+len */
+		xfs_fileoff_t	orig_off;	/* original ap->off */
+		xfs_extlen_t	mod_off;	/* modulus calculations */
+		xfs_fileoff_t	prevo;		/* previous file offset */
+		xfs_rtblock_t	rtx;		/* realtime extent number */
+		xfs_extlen_t	temp;		/* temp for rt calculations */
+
+		/*
+		 * Set prod to match the realtime extent size.
+		 */
+		if (!(extsz = ap->ip->i_d.di_extsize))
+			extsz = mp->m_sb.sb_rextsize;
+		prod = extsz / mp->m_sb.sb_rextsize;
+		orig_off = ap->off;
+		orig_alen = ap->alen;
+		orig_end = orig_off + orig_alen;
+		/*
+		 * If the file offset is unaligned vs. the extent size
+		 * we need to align it.	 This will be possible unless
+		 * the file was previously written with a kernel that didn't
+		 * perform this alignment.
+		 */
+		mod_off = do_mod(orig_off, extsz);
+		if (mod_off) {
+			ap->alen += mod_off;
+			ap->off -= mod_off;
+		}
+		/*
+		 * Same adjustment for the end of the requested area.
+		 */
+		if ((temp = (ap->alen % extsz)))
+			ap->alen += extsz - temp;
+		/*
+		 * If the previous block overlaps with this proposed allocation
+		 * then move the start forward without adjusting the length.
+		 */
+		prevo =
+			ap->prevp->br_startoff == NULLFILEOFF ?
+				0 :
+				(ap->prevp->br_startoff +
+				 ap->prevp->br_blockcount);
+		if (ap->off != orig_off && ap->off < prevo)
+			ap->off = prevo;
+		/*
+		 * If the next block overlaps with this proposed allocation
+		 * then move the start back without adjusting the length,
+		 * but not before offset 0.
+		 * This may of course make the start overlap previous block,
+		 * and if we hit the offset 0 limit then the next block
+		 * can still overlap too.
+		 */
+		nexto = (ap->eof || ap->gotp->br_startoff == NULLFILEOFF) ?
+			NULLFILEOFF : ap->gotp->br_startoff;
+		if (!ap->eof &&
+		    ap->off + ap->alen != orig_end &&
+		    ap->off + ap->alen > nexto)
+			ap->off = nexto > ap->alen ? nexto - ap->alen : 0;
+		/*
+		 * If we're now overlapping the next or previous extent that
+		 * means we can't fit an extsz piece in this hole.  Just move
+		 * the start forward to the first legal spot and set
+		 * the length so we hit the end.
+		 */
+		if ((ap->off != orig_off && ap->off < prevo) ||
+		    (ap->off + ap->alen != orig_end &&
+		     ap->off + ap->alen > nexto)) {
+			ap->off = prevo;
+			ap->alen = nexto - prevo;
+		}
+		/*
+		 * If the result isn't a multiple of rtextents we need to
+		 * remove blocks until it is.
+		 */
+		if ((temp = (ap->alen % mp->m_sb.sb_rextsize))) {
+			/*
+			 * We're not covering the original request, or
+			 * we won't be able to once we fix the length.
+			 */
+			if (orig_off < ap->off ||
+			    orig_end > ap->off + ap->alen ||
+			    ap->alen - temp < orig_alen)
+				return XFS_ERROR(EINVAL);
+			/*
+			 * Try to fix it by moving the start up.
+			 */
+			if (ap->off + temp <= orig_off) {
+				ap->alen -= temp;
+				ap->off += temp;
+			}
+			/*
+			 * Try to fix it by moving the end in.
+			 */
+			else if (ap->off + ap->alen - temp >= orig_end)
+				ap->alen -= temp;
+			/*
+			 * Set the start to the minimum then trim the length.
+			 */
+			else {
+				ap->alen -= orig_off - ap->off;
+				ap->off = orig_off;
+				ap->alen -= ap->alen % mp->m_sb.sb_rextsize;
+			}
+			/*
+			 * Result doesn't cover the request, fail it.
+			 */
+			if (orig_off < ap->off || orig_end > ap->off + ap->alen)
+				return XFS_ERROR(EINVAL);
+		}
+		ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0);
+		/*
+		 * If the offset & length are not perfectly aligned
+		 * then kill prod, it will just get us in trouble.
+		 */
+		if (do_mod(ap->off, extsz) || ap->alen % extsz)
+			prod = 1;
+		/*
+		 * Set ralen to be the actual requested length in rtextents.
+		 */
+		ralen = ap->alen / mp->m_sb.sb_rextsize;
+		/*
+		 * If the old value was close enough to MAXEXTLEN that
+		 * we rounded up to it, cut it back so it's legal again.
+		 * Note that if it's a really large request (bigger than
+		 * MAXEXTLEN), we don't hear about that number, and can't
+		 * adjust the starting point to match it.
+		 */
+		if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
+			ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+		/*
+		 * If it's an allocation to an empty file at offset 0,
+		 * pick an extent that will space things out in the rt area.
+		 */
+		if (ap->eof && ap->off == 0) {
+			error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
+			if (error)
+				return error;
+			ap->rval = rtx * mp->m_sb.sb_rextsize;
+		} else
+			ap->rval = 0;
+	}
+#else
+	if (rt)
+		ap->rval = 0;
+#endif	/* __KERNEL__ */
+	else if (nullfb)
+		ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino);
+	else
+		ap->rval = ap->firstblock;
+	/*
+	 * If allocating at eof, and there's a previous real block,
+	 * try to use it's last block as our starting point.
+	 */
+	if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
+	    !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+	    ISLEGAL(ap->prevp->br_startblock + ap->prevp->br_blockcount,
+		    ap->prevp->br_startblock)) {
+		ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
+		/*
+		 * Adjust for the gap between prevp and us.
+		 */
+		adjust = ap->off -
+			(ap->prevp->br_startoff + ap->prevp->br_blockcount);
+		if (adjust &&
+		    ISLEGAL(ap->rval + adjust, ap->prevp->br_startblock))
+			ap->rval += adjust;
+	}
+	/*
+	 * If not at eof, then compare the two neighbor blocks.
+	 * Figure out whether either one gives us a good starting point,
+	 * and pick the better one.
+	 */
+	else if (!ap->eof) {
+		xfs_fsblock_t	gotbno;		/* right side block number */
+		xfs_fsblock_t	gotdiff=0;	/* right side difference */
+		xfs_fsblock_t	prevbno;	/* left side block number */
+		xfs_fsblock_t	prevdiff=0;	/* left side difference */
+
+		/*
+		 * If there's a previous (left) block, select a requested
+		 * start block based on it.
+		 */
+		if (ap->prevp->br_startoff != NULLFILEOFF &&
+		    !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+		    (prevbno = ap->prevp->br_startblock +
+			       ap->prevp->br_blockcount) &&
+		    ISLEGAL(prevbno, ap->prevp->br_startblock)) {
+			/*
+			 * Calculate gap to end of previous block.
+			 */
+			adjust = prevdiff = ap->off -
+				(ap->prevp->br_startoff +
+				 ap->prevp->br_blockcount);
+			/*
+			 * Figure the startblock based on the previous block's
+			 * end and the gap size.
+			 * Heuristic!
+			 * If the gap is large relative to the piece we're
+			 * allocating, or using it gives us an illegal block
+			 * number, then just use the end of the previous block.
+			 */
+			if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+			    ISLEGAL(prevbno + prevdiff,
+				    ap->prevp->br_startblock))
+				prevbno += adjust;
+			else
+				prevdiff += adjust;
+			/*
+			 * If the firstblock forbids it, can't use it,
+			 * must use default.
+			 */
+			if (!rt && !nullfb &&
+			    XFS_FSB_TO_AGNO(mp, prevbno) != fb_agno)
+				prevbno = NULLFSBLOCK;
+		}
+		/*
+		 * No previous block or can't follow it, just default.
+		 */
+		else
+			prevbno = NULLFSBLOCK;
+		/*
+		 * If there's a following (right) block, select a requested
+		 * start block based on it.
+		 */
+		if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) {
+			/*
+			 * Calculate gap to start of next block.
+			 */
+			adjust = gotdiff = ap->gotp->br_startoff - ap->off;
+			/*
+			 * Figure the startblock based on the next block's
+			 * start and the gap size.
+			 */
+			gotbno = ap->gotp->br_startblock;
+			/*
+			 * Heuristic!
+			 * If the gap is large relative to the piece we're
+			 * allocating, or using it gives us an illegal block
+			 * number, then just use the start of the next block
+			 * offset by our length.
+			 */
+			if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen &&
+			    ISLEGAL(gotbno - gotdiff, gotbno))
+				gotbno -= adjust;
+			else if (ISLEGAL(gotbno - ap->alen, gotbno)) {
+				gotbno -= ap->alen;
+				gotdiff += adjust - ap->alen;
+			} else
+				gotdiff += adjust;
+			/*
+			 * If the firstblock forbids it, can't use it,
+			 * must use default.
+			 */
+			if (!rt && !nullfb &&
+			    XFS_FSB_TO_AGNO(mp, gotbno) != fb_agno)
+				gotbno = NULLFSBLOCK;
+		}
+		/*
+		 * No next block, just default.
+		 */
+		else
+			gotbno = NULLFSBLOCK;
+		/*
+		 * If both valid, pick the better one, else the only good
+		 * one, else ap->rval is already set (to 0 or the inode block).
+		 */
+		if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+			ap->rval = prevdiff <= gotdiff ? prevbno : gotbno;
+		else if (prevbno != NULLFSBLOCK)
+			ap->rval = prevbno;
+		else if (gotbno != NULLFSBLOCK)
+			ap->rval = gotbno;
+	}
+	/*
+	 * If allowed, use ap->rval; otherwise must use firstblock since
+	 * it's in the right allocation group.
+	 */
+	if (nullfb || rt || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno)
+		;
+	else
+		ap->rval = ap->firstblock;
+	/*
+	 * Realtime allocation, done through xfs_rtallocate_extent.
+	 */
+	if (rt) {
+#ifndef __KERNEL__
+		ASSERT(0);
+#else
+		xfs_rtblock_t	rtb;
+
+		atype = ap->rval == 0 ?
+			XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
+		do_div(ap->rval, mp->m_sb.sb_rextsize);
+		rtb = ap->rval;
+		ap->alen = ralen;
+		if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen,
+				&ralen, atype, ap->wasdel, prod, &rtb)))
+			return error;
+		if (rtb == NULLFSBLOCK && prod > 1 &&
+		    (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1,
+						   ap->alen, &ralen, atype,
+						   ap->wasdel, 1, &rtb)))
+			return error;
+		ap->rval = rtb;
+		if (ap->rval != NULLFSBLOCK) {
+			ap->rval *= mp->m_sb.sb_rextsize;
+			ralen *= mp->m_sb.sb_rextsize;
+			ap->alen = ralen;
+			ap->ip->i_d.di_nblocks += ralen;
+			xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+			if (ap->wasdel)
+				ap->ip->i_delayed_blks -= ralen;
+			/*
+			 * Adjust the disk quota also. This was reserved
+			 * earlier.
+			 */
+			if (XFS_IS_QUOTA_ON(mp) &&
+			    ap->ip->i_ino != mp->m_sb.sb_uquotino &&
+			    ap->ip->i_ino != mp->m_sb.sb_gquotino)
+				xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+					ap->wasdel ?
+						XFS_TRANS_DQ_DELRTBCOUNT :
+						XFS_TRANS_DQ_RTBCOUNT,
+					(long)ralen);
+		} else
+			ap->alen = 0;
+#endif	/* __KERNEL__ */
+	}
+	/*
+	 * Normal allocation, done through xfs_alloc_vextent.
+	 */
+	else {
+		xfs_agnumber_t	ag;
+		xfs_alloc_arg_t args;
+		xfs_extlen_t	blen;
+		xfs_extlen_t	delta;
+		int		isaligned;
+		xfs_extlen_t	longest;
+		xfs_extlen_t	need;
+		xfs_extlen_t	nextminlen=0;
+		int		notinit;
+		xfs_perag_t	*pag;
+		xfs_agnumber_t	startag;
+		int		tryagain;
+
+		tryagain = isaligned = 0;
+		args.tp = ap->tp;
+		args.mp = mp;
+		args.fsbno = ap->rval;
+		args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
+		blen = 0;
+		if (nullfb) {
+			args.type = XFS_ALLOCTYPE_START_BNO;
+			args.total = ap->total;
+			/*
+			 * Find the longest available space.
+			 * We're going to try for the whole allocation at once.
+			 */
+			startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
+			notinit = 0;
+			down_read(&mp->m_peraglock);
+			while (blen < ap->alen) {
+				pag = &mp->m_perag[ag];
+				if (!pag->pagf_init &&
+				    (error = xfs_alloc_pagf_init(mp, args.tp,
+					    ag, XFS_ALLOC_FLAG_TRYLOCK))) {
+					up_read(&mp->m_peraglock);
+					return error;
+				}
+				/*
+				 * See xfs_alloc_fix_freelist...
+				 */
+				if (pag->pagf_init) {
+					need = XFS_MIN_FREELIST_PAG(pag, mp);
+					delta = need > pag->pagf_flcount ?
+						need - pag->pagf_flcount : 0;
+					longest = (pag->pagf_longest > delta) ?
+						(pag->pagf_longest - delta) :
+						(pag->pagf_flcount > 0 ||
+						 pag->pagf_longest > 0);
+					if (blen < longest)
+						blen = longest;
+				} else
+					notinit = 1;
+				if (++ag == mp->m_sb.sb_agcount)
+					ag = 0;
+				if (ag == startag)
+					break;
+			}
+			up_read(&mp->m_peraglock);
+			/*
+			 * Since the above loop did a BUF_TRYLOCK, it is
+			 * possible that there is space for this request.
+			 */
+			if (notinit || blen < ap->minlen)
+				args.minlen = ap->minlen;
+			/*
+			 * If the best seen length is less than the request
+			 * length, use the best as the minimum.
+			 */
+			else if (blen < ap->alen)
+				args.minlen = blen;
+			/*
+			 * Otherwise we've seen an extent as big as alen,
+			 * use that as the minimum.
+			 */
+			else
+				args.minlen = ap->alen;
+		} else if (ap->low) {
+			args.type = XFS_ALLOCTYPE_FIRST_AG;
+			args.total = args.minlen = ap->minlen;
+		} else {
+			args.type = XFS_ALLOCTYPE_NEAR_BNO;
+			args.total = ap->total;
+			args.minlen = ap->minlen;
+		}
+		if (ap->ip->i_d.di_extsize) {
+			args.prod = ap->ip->i_d.di_extsize;
+			if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod)))
+				args.mod = (xfs_extlen_t)(args.prod - args.mod);
+		} else if (mp->m_sb.sb_blocksize >= NBPP) {
+			args.prod = 1;
+			args.mod = 0;
+		} else {
+			args.prod = NBPP >> mp->m_sb.sb_blocklog;
+			if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod))))
+				args.mod = (xfs_extlen_t)(args.prod - args.mod);
+		}
+		/*
+		 * If we are not low on available data blocks, and the
+		 * underlying logical volume manager is a stripe, and
+		 * the file offset is zero then try to allocate data
+		 * blocks on stripe unit boundary.
+		 * NOTE: ap->aeof is only set if the allocation length
+		 * is >= the stripe unit and the allocation offset is
+		 * at the end of file.
+		 */
+		if (!ap->low && ap->aeof) {
+			if (!ap->off) {
+				args.alignment = mp->m_dalign;
+				atype = args.type;
+				isaligned = 1;
+				/*
+				 * Adjust for alignment
+				 */
+				if (blen > args.alignment && blen <= ap->alen)
+					args.minlen = blen - args.alignment;
+				args.minalignslop = 0;
+			} else {
+				/*
+				 * First try an exact bno allocation.
+				 * If it fails then do a near or start bno
+				 * allocation with alignment turned on.
+				 */
+				atype = args.type;
+				tryagain = 1;
+				args.type = XFS_ALLOCTYPE_THIS_BNO;
+				args.alignment = 1;
+				/*
+				 * Compute the minlen+alignment for the
+				 * next case.  Set slop so that the value
+				 * of minlen+alignment+slop doesn't go up
+				 * between the calls.
+				 */
+				if (blen > mp->m_dalign && blen <= ap->alen)
+					nextminlen = blen - mp->m_dalign;
+				else
+					nextminlen = args.minlen;
+				if (nextminlen + mp->m_dalign > args.minlen + 1)
+					args.minalignslop =
+						nextminlen + mp->m_dalign -
+						args.minlen - 1;
+				else
+					args.minalignslop = 0;
+			}
+		} else {
+			args.alignment = 1;
+			args.minalignslop = 0;
+		}
+		args.minleft = ap->minleft;
+		args.wasdel = ap->wasdel;
+		args.isfl = 0;
+		args.userdata = ap->userdata;
+		if ((error = xfs_alloc_vextent(&args)))
+			return error;
+		if (tryagain && args.fsbno == NULLFSBLOCK) {
+			/*
+			 * Exact allocation failed. Now try with alignment
+			 * turned on.
+			 */
+			args.type = atype;
+			args.fsbno = ap->rval;
+			args.alignment = mp->m_dalign;
+			args.minlen = nextminlen;
+			args.minalignslop = 0;
+			isaligned = 1;
+			if ((error = xfs_alloc_vextent(&args)))
+				return error;
+		}
+		if (isaligned && args.fsbno == NULLFSBLOCK) {
+			/*
+			 * allocation failed, so turn off alignment and
+			 * try again.
+			 */
+			args.type = atype;
+			args.fsbno = ap->rval;
+			args.alignment = 0;
+			if ((error = xfs_alloc_vextent(&args)))
+				return error;
+		}
+		if (args.fsbno == NULLFSBLOCK && nullfb &&
+		    args.minlen > ap->minlen) {
+			args.minlen = ap->minlen;
+			args.type = XFS_ALLOCTYPE_START_BNO;
+			args.fsbno = ap->rval;
+			if ((error = xfs_alloc_vextent(&args)))
+				return error;
+		}
+		if (args.fsbno == NULLFSBLOCK && nullfb) {
+			args.fsbno = 0;
+			args.type = XFS_ALLOCTYPE_FIRST_AG;
+			args.total = ap->minlen;
+			args.minleft = 0;
+			if ((error = xfs_alloc_vextent(&args)))
+				return error;
+			ap->low = 1;
+		}
+		if (args.fsbno != NULLFSBLOCK) {
+			ap->firstblock = ap->rval = args.fsbno;
+			ASSERT(nullfb || fb_agno == args.agno ||
+			       (ap->low && fb_agno < args.agno));
+			ap->alen = args.len;
+			ap->ip->i_d.di_nblocks += args.len;
+			xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+			if (ap->wasdel)
+				ap->ip->i_delayed_blks -= args.len;
+			/*
+			 * Adjust the disk quota also. This was reserved
+			 * earlier.
+			 */
+			if (XFS_IS_QUOTA_ON(mp) &&
+			    ap->ip->i_ino != mp->m_sb.sb_uquotino &&
+			    ap->ip->i_ino != mp->m_sb.sb_gquotino)
+				xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
+					ap->wasdel ?
+						XFS_TRANS_DQ_DELBCOUNT :
+						XFS_TRANS_DQ_BCOUNT,
+					(long)args.len);
+		} else {
+			ap->rval = NULLFSBLOCK;
+			ap->alen = 0;
+		}
+	}
+	return 0;
+#undef	ISLEGAL
+}
+
+/*
+ * Transform a btree format file with only one leaf node, where the
+ * extents list will fit in the inode, into an extents format file.
+ * Since the extent list is already in-core, all we have to do is
+ * give up the space for the btree root and pitch the leaf block.
+ */
+STATIC int				/* error */
+xfs_bmap_btree_to_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork,  /* data or attr fork */
+	int			async)	    /* xaction can be async */
+{
+	/* REFERENCED */
+	xfs_bmbt_block_t	*cblock;/* child btree block */
+	xfs_fsblock_t		cbno;	/* child block number */
+	xfs_buf_t		*cbp;	/* child block's buffer */
+	int			error;	/* error return value */
+	xfs_ifork_t		*ifp;	/* inode fork data */
+	xfs_mount_t		*mp;	/* mount point structure */
+	xfs_bmbt_ptr_t		*pp;	/* ptr to block address */
+	xfs_bmbt_block_t	*rblock;/* root btree block */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+	rblock = ifp->if_broot;
+	ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) == 1);
+	ASSERT(INT_GET(rblock->bb_numrecs, ARCH_CONVERT) == 1);
+	ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
+	mp = ip->i_mount;
+	pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
+	*logflagsp = 0;
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1)))
+		return error;
+#endif
+	cbno = INT_GET(*pp, ARCH_CONVERT);
+	if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
+			XFS_BMAP_BTREE_REF)))
+		return error;
+	cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
+	if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
+		return error;
+	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
+	if (!async)
+		xfs_trans_set_sync(tp);
+	ip->i_d.di_nblocks--;
+	if (XFS_IS_QUOTA_ON(mp) &&
+	    ip->i_ino != mp->m_sb.sb_uquotino &&
+	    ip->i_ino != mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(tp, cbp);
+	if (cur->bc_bufs[0] == cbp)
+		cur->bc_bufs[0] = NULL;
+	xfs_iroot_realloc(ip, -1, whichfork);
+	ASSERT(ifp->if_broot == NULL);
+	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+	return 0;
+}
+
+/*
+ * Called by xfs_bmapi to update extent list structure and the btree
+ * after removing space (or undoing a delayed allocation).
+ */
+STATIC int				/* error */
+xfs_bmap_del_extent(
+	xfs_inode_t		*ip,	/* incore inode pointer */
+	xfs_trans_t		*tp,	/* current transaction pointer */
+	xfs_extnum_t		idx,	/* extent number to update/delete */
+	xfs_bmap_free_t		*flist, /* list of extents to be freed */
+	xfs_btree_cur_t		*cur,	/* if null, not a btree */
+	xfs_bmbt_irec_t		*del,	/* data to remove from extent list */
+	int			iflags, /* input flags */
+	int			*logflagsp, /* inode logging flags */
+	int			whichfork, /* data or attr fork */
+	int			rsvd)	/* OK to allocate reserved blocks */
+{
+	xfs_filblks_t		da_new; /* new delay-alloc indirect blocks */
+	xfs_filblks_t		da_old; /* old delay-alloc indirect blocks */
+	xfs_fsblock_t		del_endblock=0; /* first block past del */
+	xfs_fileoff_t		del_endoff;	/* first offset past del */
+	int			delay;	/* current block is delayed allocated */
+	int			do_fx;	/* free extent at end of routine */
+	xfs_bmbt_rec_t		*ep;	/* current extent entry pointer */
+	int			error;	/* error return value */
+	int			flags;	/* inode logging flags */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_del_extent";
+#endif
+	xfs_bmbt_irec_t		got;	/* current extent entry */
+	xfs_fileoff_t		got_endoff;	/* first offset past got */
+	int			i;	/* temp state */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	xfs_mount_t		*mp;	/* mount structure */
+	xfs_filblks_t		nblks;	/* quota/sb block count */
+	xfs_bmbt_irec_t		new;	/* new record to be inserted */
+	/* REFERENCED */
+	xfs_extnum_t		nextents;	/* number of extents in list */
+	uint			qfield; /* quota field to update */
+	xfs_filblks_t		temp;	/* for indirect length calculations */
+	xfs_filblks_t		temp2;	/* for indirect length calculations */
+
+	XFS_STATS_INC(xfsstats.xs_del_exlist);
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(idx >= 0 && idx < nextents);
+	ASSERT(del->br_blockcount > 0);
+	ep = &ifp->if_u1.if_extents[idx];
+	xfs_bmbt_get_all(ep, &got);
+	ASSERT(got.br_startoff <= del->br_startoff);
+	del_endoff = del->br_startoff + del->br_blockcount;
+	got_endoff = got.br_startoff + got.br_blockcount;
+	ASSERT(got_endoff >= del_endoff);
+	delay = ISNULLSTARTBLOCK(got.br_startblock);
+	ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay);
+	flags = 0;
+	qfield = 0;
+	error = 0;
+	/*
+	 * If deleting a real allocation, must free up the disk space.
+	 */
+	if (!delay) {
+		flags = XFS_ILOG_CORE;
+		/*
+		 * Realtime allocation.	 Free it and record di_nblocks update.
+		 */
+		if (whichfork == XFS_DATA_FORK &&
+		    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
+			xfs_fsblock_t	bno;
+			xfs_filblks_t	len;
+
+			ASSERT(do_mod(del->br_blockcount,
+				      mp->m_sb.sb_rextsize) == 0);
+			ASSERT(do_mod(del->br_startblock,
+				      mp->m_sb.sb_rextsize) == 0);
+			bno = del->br_startblock;
+			len = del->br_blockcount;
+			do_div(bno, mp->m_sb.sb_rextsize);
+			do_div(len, mp->m_sb.sb_rextsize);
+			if ((error = xfs_rtfree_extent(ip->i_transp, bno,
+					(xfs_extlen_t)len)))
+				goto done;
+			do_fx = 0;
+			nblks = len * mp->m_sb.sb_rextsize;
+			if (XFS_IS_QUOTA_ON(mp) &&
+			    ip->i_ino != mp->m_sb.sb_uquotino &&
+			    ip->i_ino != mp->m_sb.sb_gquotino)
+				qfield = XFS_TRANS_DQ_RTBCOUNT;
+		}
+		/*
+		 * Ordinary allocation.
+		 */
+		else {
+			do_fx = 1;
+			nblks = del->br_blockcount;
+			if (XFS_IS_QUOTA_ON(mp) &&
+			    ip->i_ino != mp->m_sb.sb_uquotino &&
+			    ip->i_ino != mp->m_sb.sb_gquotino)
+				qfield = XFS_TRANS_DQ_BCOUNT;
+		}
+		/*
+		 * Set up del_endblock and cur for later.
+		 */
+		del_endblock = del->br_startblock + del->br_blockcount;
+		if (cur) {
+			if ((error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+					got.br_startblock, got.br_blockcount,
+					&i)))
+				goto done;
+			ASSERT(i == 1);
+		}
+		da_old = da_new = 0;
+	} else {
+		da_old = STARTBLOCKVAL(got.br_startblock);
+		da_new = 0;
+		nblks = 0;
+		do_fx = 0;
+	}
+	/*
+	 * Set flag value to use in switch statement.
+	 * Left-contig is 2, right-contig is 1.
+	 */
+	switch (((got.br_startoff == del->br_startoff) << 1) |
+		(got_endoff == del_endoff)) {
+	case 3:
+		/*
+		 * Matches the whole extent.  Delete the entry.
+		 */
+		xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork);
+		xfs_bmap_delete_exlist(ip, idx, 1, whichfork);
+		ifp->if_lastex = idx;
+		if (delay)
+			break;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+		flags |= XFS_ILOG_CORE;
+		if (!cur) {
+			flags |= XFS_ILOG_FEXT(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_delete(cur, iflags & XFS_BMAPI_ASYNC, &i)))
+			goto done;
+		ASSERT(i == 1);
+		break;
+
+	case 2:
+		/*
+		 * Deleting the first part of the extent.
+		 */
+		xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork);
+		xfs_bmbt_set_startoff(ep, del_endoff);
+		temp = got.br_blockcount - del->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		ifp->if_lastex = idx;
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmap_trace_post_update(fname, "2", ip, idx,
+				whichfork);
+			da_new = temp;
+			break;
+		}
+		xfs_bmbt_set_startblock(ep, del_endblock);
+		xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork);
+		if (!cur) {
+			flags |= XFS_ILOG_FEXT(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 1:
+		/*
+		 * Deleting the last part of the extent.
+		 */
+		temp = got.br_blockcount - del->br_blockcount;
+		xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork);
+		xfs_bmbt_set_blockcount(ep, temp);
+		ifp->if_lastex = idx;
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmap_trace_post_update(fname, "1", ip, idx,
+				whichfork);
+			da_new = temp;
+			break;
+		}
+		xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork);
+		if (!cur) {
+			flags |= XFS_ILOG_FEXT(whichfork);
+			break;
+		}
+		if ((error = xfs_bmbt_update(cur, got.br_startoff,
+				got.br_startblock,
+				got.br_blockcount - del->br_blockcount,
+				got.br_state)))
+			goto done;
+		break;
+
+	case 0:
+		/*
+		 * Deleting the middle of the extent.
+		 */
+		temp = del->br_startoff - got.br_startoff;
+		xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork);
+		xfs_bmbt_set_blockcount(ep, temp);
+		new.br_startoff = del_endoff;
+		temp2 = got_endoff - del_endoff;
+		new.br_blockcount = temp2;
+		new.br_state = got.br_state;
+		if (!delay) {
+			new.br_startblock = del_endblock;
+			flags |= XFS_ILOG_CORE;
+			if (cur) {
+				if ((error = xfs_bmbt_update(cur,
+						got.br_startoff,
+						got.br_startblock, temp,
+						got.br_state)))
+					goto done;
+				if ((error = xfs_bmbt_increment(cur, 0, &i)))
+					goto done;
+				cur->bc_rec.b = new;
+				error = xfs_bmbt_insert(cur, &i);
+				if (error && error != ENOSPC)
+					goto done;
+				/*
+				 * If get no-space back from btree insert,
+				 * it tried a split, and we have a zero
+				 * block reservation.
+				 * Fix up our state and return the error.
+				 */
+				if (error == ENOSPC) {
+					/*
+					 * Reset the cursor, don't trust
+					 * it after any insert operation.
+					 */
+					if ((error = xfs_bmbt_lookup_eq(cur,
+							got.br_startoff,
+							got.br_startblock,
+							temp, &i)))
+						goto done;
+					ASSERT(i == 1);
+					/*
+					 * Update the btree record back
+					 * to the original value.
+					 */
+					if ((error = xfs_bmbt_update(cur,
+							got.br_startoff,
+							got.br_startblock,
+							got.br_blockcount,
+							got.br_state)))
+						goto done;
+					/*
+					 * Reset the extent record back
+					 * to the original value.
+					 */
+					xfs_bmbt_set_blockcount(ep,
+						got.br_blockcount);
+					flags = 0;
+					error = XFS_ERROR(ENOSPC);
+					goto done;
+				}
+				ASSERT(i == 1);
+			} else
+				flags |= XFS_ILOG_FEXT(whichfork);
+			XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+		} else {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			temp = xfs_bmap_worst_indlen(ip, temp);
+			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			temp2 = xfs_bmap_worst_indlen(ip, temp2);
+			new.br_startblock = NULLSTARTBLOCK((int)temp2);
+			da_new = temp + temp2;
+			while (da_new > da_old) {
+				if (temp) {
+					temp--;
+					da_new--;
+					xfs_bmbt_set_startblock(ep,
+						NULLSTARTBLOCK((int)temp));
+				}
+				if (da_new == da_old)
+					break;
+				if (temp2) {
+					temp2--;
+					da_new--;
+					new.br_startblock =
+						NULLSTARTBLOCK((int)temp2);
+				}
+			}
+		}
+		xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork);
+		xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL,
+			whichfork);
+		xfs_bmap_insert_exlist(ip, idx + 1, 1, &new, whichfork);
+		ifp->if_lastex = idx + 1;
+		break;
+	}
+	/*
+	 * If we need to, add to list of extents to delete.
+	 */
+	if (do_fx)
+		xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist,
+			mp);
+	/*
+	 * Adjust inode # blocks in the file.
+	 */
+	if (nblks)
+		ip->i_d.di_nblocks -= nblks;
+	/*
+	 * Adjust quota data.
+	 */
+	if (qfield)
+		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
+	/*
+	 * Account for change in delayed indirect blocks.
+	 * Nothing to do for disk quota accounting here.
+	 */
+	ASSERT(da_old >= da_new);
+	if (da_old > da_new)
+		xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new),
+			rsvd);
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+/*
+ * Remove the entry "free" from the free item list.  Prev points to the
+ * previous entry, unless "free" is the head of the list.
+ */
+STATIC void
+xfs_bmap_del_free(
+	xfs_bmap_free_t		*flist, /* free item list header */
+	xfs_bmap_free_item_t	*prev,	/* previous item on list, if any */
+	xfs_bmap_free_item_t	*free)	/* list item to be freed */
+{
+	if (prev)
+		prev->xbfi_next = free->xbfi_next;
+	else
+		flist->xbf_first = free->xbfi_next;
+	flist->xbf_count--;
+	kmem_zone_free(xfs_bmap_free_item_zone, free);
+}
+
+/*
+ * Remove count entries from the extents array for inode "ip", starting
+ * at index "idx".  Copies the remaining items down over the deleted ones,
+ * and gives back the excess memory.
+ */
+STATIC void
+xfs_bmap_delete_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* starting delete index */
+	xfs_extnum_t	count,		/* count of items to delete */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;		/* base of extent list */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_extnum_t	nextents;	/* number of extents in list after */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	base = ifp->if_u1.if_extents;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - count;
+	ovbcopy(&base[idx + count], &base[idx],
+		(nextents - idx) * sizeof(*base));
+	xfs_iext_realloc(ip, -count, whichfork);
+}
+
+/*
+ * Convert an extents-format file into a btree-format file.
+ * The new file will have a root block (in the inode) and a single child block.
+ */
+STATIC int					/* error */
+xfs_bmap_extents_to_btree(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	xfs_fsblock_t		*firstblock,	/* first-block-allocated */
+	xfs_bmap_free_t		*flist,		/* blocks freed in xaction */
+	xfs_btree_cur_t		**curp,		/* cursor returned to caller */
+	int			wasdel,		/* converting a delayed alloc */
+	int			*logflagsp,	/* inode logging flags */
+	int			whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_block_t	*ablock;	/* allocated (child) bt block */
+	xfs_buf_t		*abp;		/* buffer for ablock */
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_rec_t		*arp;		/* child record pointer */
+	xfs_bmbt_block_t	*block;		/* btree root block */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_rec_t		*ep;		/* extent list pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		i;		/* extent list index */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_bmbt_key_t		*kp;		/* root block key pointer */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* extent list size */
+	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	/*
+	 * Make space in the inode incore.
+	 */
+	xfs_iroot_realloc(ip, 1, whichfork);
+	ifp->if_flags |= XFS_IFBROOT;
+	/*
+	 * Fill in the root.
+	 */
+	block = ifp->if_broot;
+	INT_SET(block->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	INT_SET(block->bb_level, ARCH_CONVERT, 1);
+	INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
+	INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
+	INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
+	/*
+	 * Need a cursor.  Can't allocate until bb_level is filled in.
+	 */
+	mp = ip->i_mount;
+	cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
+		whichfork);
+	cur->bc_private.b.firstblock = *firstblock;
+	cur->bc_private.b.flist = flist;
+	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
+	/*
+	 * Convert to a btree with two levels, one record in root.
+	 */
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	args.tp = tp;
+	args.mp = mp;
+	if (*firstblock == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+	} else if (flist->xbf_low) {
+		args.type = XFS_ALLOCTYPE_START_BNO;
+		args.fsbno = *firstblock;
+	} else {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.fsbno = *firstblock;
+	}
+	args.minlen = args.maxlen = args.prod = 1;
+	args.total = args.minleft = args.alignment = args.mod = args.isfl =
+		args.minalignslop = 0;
+	args.wasdel = wasdel;
+	*logflagsp = 0;
+	if ((error = xfs_alloc_vextent(&args))) {
+		xfs_iroot_realloc(ip, -1, whichfork);
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+		return error;
+	}
+	/*
+	 * Allocation can't fail, the space was reserved.
+	 */
+	ASSERT(args.fsbno != NULLFSBLOCK);
+	ASSERT(*firstblock == NULLFSBLOCK ||
+	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+	       (flist->xbf_low &&
+		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	ip->i_d.di_nblocks++;
+	if (XFS_IS_QUOTA_ON(mp) &&
+	    ip->i_ino != mp->m_sb.sb_uquotino &&
+	    ip->i_ino != mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+	abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+	/*
+	 * Fill in the child block.
+	 */
+	ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
+	INT_SET(ablock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	INT_ZERO(ablock->bb_level, ARCH_CONVERT);
+	INT_ZERO(ablock->bb_numrecs, ARCH_CONVERT);
+	INT_SET(ablock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
+	INT_SET(ablock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
+	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (ep = ifp->if_u1.if_extents, i = 0; i < nextents; i++, ep++) {
+		if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) {
+			*arp++ = *ep;
+			INT_MOD(ablock->bb_numrecs, ARCH_CONVERT, +1);
+		}
+	}
+	ASSERT(INT_GET(ablock->bb_numrecs, ARCH_CONVERT) == XFS_IFORK_NEXTENTS(ip, whichfork));
+	/*
+	 * Fill in the root key and pointer.
+	 */
+	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(arp));
+	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+	INT_SET(*pp, ARCH_CONVERT, args.fsbno);
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_bmbt_log_recs(cur, abp, 1, INT_GET(ablock->bb_numrecs, ARCH_CONVERT));
+	ASSERT(*curp == NULL);
+	*curp = cur;
+	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
+	return 0;
+}
+
+/*
+ * Insert new item(s) in the extent list for inode "ip".
+ * Count new items are inserted at offset idx.
+ */
+STATIC void
+xfs_bmap_insert_exlist(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* starting index of new items */
+	xfs_extnum_t	count,		/* number of inserted items */
+	xfs_bmbt_irec_t *new,		/* items to insert */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;		/* extent list base */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_extnum_t	nextents;	/* extent list size */
+	xfs_extnum_t	to;		/* extent list index */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	xfs_iext_realloc(ip, count, whichfork);
+	base = ifp->if_u1.if_extents;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	ovbcopy(&base[idx], &base[idx + count],
+		(nextents - (idx + count)) * sizeof(*base));
+	for (to = idx; to < idx + count; to++, new++)
+		xfs_bmbt_set_all(&base[to], new);
+}
+
+/*
+ * Convert a local file to an extents file.
+ * This code is out of bounds for data forks of regular files,
+ * since the file data needs to get logged so things will stay consistent.
+ * (The bmap-level manipulations are ok, though).
+ */
+STATIC int				/* error */
+xfs_bmap_local_to_extents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
+	xfs_extlen_t	total,		/* total blocks needed by transaction */
+	int		*logflagsp,	/* inode logging flags */
+	int		whichfork)	/* data or attr fork */
+{
+	int		error;		/* error return value */
+	int		flags;		/* logging flags returned */
+#ifdef XFS_BMAP_TRACE
+	static char	fname[] = "xfs_bmap_local_to_extents";
+#endif
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	/*
+	 * We don't want to deal with the case of keeping inode data inline yet.
+	 * So sending the data fork of a regular inode is illegal.
+	 */
+	ASSERT(!((ip->i_d.di_mode & IFMT) == IFREG &&
+		 whichfork == XFS_DATA_FORK));
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	flags = 0;
+	error = 0;
+	if (ifp->if_bytes) {
+		xfs_alloc_arg_t args;	/* allocation arguments */
+		xfs_buf_t	*bp;	/* buffer for extent list block */
+		xfs_bmbt_rec_t	*ep;	/* extent list pointer */
+
+		args.tp = tp;
+		args.mp = ip->i_mount;
+		ASSERT(ifp->if_flags & XFS_IFINLINE);
+		/*
+		 * Allocate a block.  We know we need only one, since the
+		 * file currently fits in an inode.
+		 */
+		if (*firstblock == NULLFSBLOCK) {
+			args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
+			args.type = XFS_ALLOCTYPE_START_BNO;
+		} else {
+			args.fsbno = *firstblock;
+			args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		}
+		args.total = total;
+		args.mod = args.minleft = args.alignment = args.wasdel =
+			args.isfl = args.minalignslop = 0;
+		args.minlen = args.maxlen = args.prod = 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			goto done;
+		/*
+		 * Can't fail, the space was reserved.
+		 */
+		ASSERT(args.fsbno != NULLFSBLOCK);
+		ASSERT(args.len == 1);
+		*firstblock = args.fsbno;
+		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+		bcopy(ifp->if_u1.if_data, (char *)XFS_BUF_PTR(bp),
+			ifp->if_bytes);
+		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+		xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
+		xfs_iext_realloc(ip, 1, whichfork);
+		ep = ifp->if_u1.if_extents;
+		xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM);
+		xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork);
+		XFS_IFORK_NEXT_SET(ip, whichfork, 1);
+		ip->i_d.di_nblocks = 1;
+		if (XFS_IS_QUOTA_ON(args.mp) &&
+		    ip->i_ino != args.mp->m_sb.sb_uquotino &&
+		    ip->i_ino != args.mp->m_sb.sb_gquotino)
+			xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+				1L);
+		flags |= XFS_ILOG_FEXT(whichfork);
+	} else
+		ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
+	ifp->if_flags &= ~XFS_IFINLINE;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+	flags |= XFS_ILOG_CORE;
+done:
+	*logflagsp = flags;
+	return error;
+}
+
+xfs_bmbt_rec_t *			/* pointer to found extent entry */
+xfs_bmap_do_search_extents(
+	xfs_bmbt_rec_t	*base,		/* base of extent list */
+	xfs_extnum_t	lastx,		/* last extent index used */
+	xfs_extnum_t	nextents,	/* extent list size */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t *gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t *prevp)		/* out: previous extent entry found */
+{
+	xfs_bmbt_rec_t	*ep;		/* extent list entry pointer */
+	xfs_bmbt_irec_t got;		/* extent list entry, decoded */
+	int		high;		/* high index of binary search */
+	int		low;		/* low index of binary search */
+
+	if (lastx != NULLEXTNUM && lastx < nextents)
+		ep = base + lastx;
+	else
+		ep = NULL;
+	prevp->br_startoff = NULLFILEOFF;
+	if (ep && bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep)) &&
+	    bno < got.br_startoff +
+		  (got.br_blockcount = xfs_bmbt_get_blockcount(ep)))
+		*eofp = 0;
+	else if (ep && lastx < nextents - 1 &&
+		 bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep + 1)) &&
+		 bno < got.br_startoff +
+		       (got.br_blockcount = xfs_bmbt_get_blockcount(ep + 1))) {
+		lastx++;
+		ep++;
+		*eofp = 0;
+	} else if (nextents == 0)
+		*eofp = 1;
+	else if (bno == 0 &&
+		 (got.br_startoff = xfs_bmbt_get_startoff(base)) == 0) {
+		ep = base;
+		lastx = 0;
+		got.br_blockcount = xfs_bmbt_get_blockcount(ep);
+		*eofp = 0;
+	} else {
+		/* binary search the extents array */
+		low = 0;
+		high = nextents - 1;
+		while (low <= high) {
+			XFS_STATS_INC(xfsstats.xs_cmp_exlist);
+			lastx = (low + high) >> 1;
+			ep = base + lastx;
+			got.br_startoff = xfs_bmbt_get_startoff(ep);
+			got.br_blockcount = xfs_bmbt_get_blockcount(ep);
+			if (bno < got.br_startoff)
+				high = lastx - 1;
+			else if (bno >= got.br_startoff + got.br_blockcount)
+				low = lastx + 1;
+			else {
+				got.br_startblock = xfs_bmbt_get_startblock(ep);
+				got.br_state = xfs_bmbt_get_state(ep);
+				*eofp = 0;
+				*lastxp = lastx;
+				*gotp = got;
+				return ep;
+			}
+		}
+		if (bno >= got.br_startoff + got.br_blockcount) {
+			lastx++;
+			if (lastx == nextents) {
+				*eofp = 1;
+				got.br_startblock = xfs_bmbt_get_startblock(ep);
+				got.br_state = xfs_bmbt_get_state(ep);
+				*prevp = got;
+				ep = NULL;
+			} else {
+				*eofp = 0;
+				xfs_bmbt_get_all(ep, prevp);
+				ep++;
+				got.br_startoff = xfs_bmbt_get_startoff(ep);
+				got.br_blockcount = xfs_bmbt_get_blockcount(ep);
+			}
+		} else {
+			*eofp = 0;
+			if (ep > base)
+				xfs_bmbt_get_all(ep - 1, prevp);
+		}
+	}
+	if (ep) {
+		got.br_startblock = xfs_bmbt_get_startblock(ep);
+		got.br_state = xfs_bmbt_get_state(ep);
+	}
+	*lastxp = lastx;
+	*gotp = got;
+	return ep;
+}
+
+/*
+ * Search the extents list for the inode, for the extent containing bno.
+ * If bno lies in a hole, point to the next entry.  If bno lies past eof,
+ * *eofp will be set, and *prevp will contain the last entry (null if none).
+ * Else, *lastxp will be set to the index of the found
+ * entry; *gotp will contain the entry.
+ */
+STATIC xfs_bmbt_rec_t *			/* pointer to found extent entry */
+xfs_bmap_search_extents(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fileoff_t	bno,		/* block number searched for */
+	int		whichfork,	/* data or attr fork */
+	int		*eofp,		/* out: end of file found */
+	xfs_extnum_t	*lastxp,	/* out: last extent index */
+	xfs_bmbt_irec_t *gotp,		/* out: extent entry found */
+	xfs_bmbt_irec_t *prevp)		/* out: previous extent entry found */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_t	*base;		/* base of extent list */
+	xfs_extnum_t	lastx;		/* last extent index used */
+	xfs_extnum_t	nextents;	/* extent list size */
+
+	XFS_STATS_INC(xfsstats.xs_look_exlist);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	lastx = ifp->if_lastex;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	base = &ifp->if_u1.if_extents[0];
+
+	return xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp,
+					  lastxp, gotp, prevp);
+}
+
+
+#ifdef XFS_BMAP_TRACE
+/*
+ * Add a bmap trace buffer entry.  Base routine for the others.
+ */
+STATIC void
+xfs_bmap_trace_addentry(
+	int		opcode,		/* operation */
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(ies) */
+	xfs_extnum_t	cnt,		/* count of entries, 1 or 2 */
+	xfs_bmbt_rec_t	*r1,		/* first record */
+	xfs_bmbt_rec_t	*r2,		/* second record or null */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	tr2;
+
+	ASSERT(cnt == 1 || cnt == 2);
+	ASSERT(r1 != NULL);
+	if (cnt == 1) {
+		ASSERT(r2 == NULL);
+		r2 = &tr2;
+		bzero(&tr2, sizeof(tr2));
+	} else
+		ASSERT(r2 != NULL);
+	ktrace_enter(xfs_bmap_trace_buf,
+		(void *)(__psint_t)(opcode | (whichfork << 16)),
+		(void *)fname, (void *)desc, (void *)ip,
+		(void *)(__psint_t)idx,
+		(void *)(__psint_t)cnt,
+		(void *)(__psunsigned_t)(ip->i_ino >> 32),
+		(void *)(__psunsigned_t)(unsigned)ip->i_ino,
+		(void *)(__psunsigned_t)(INT_GET(r1->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r1->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l1, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l1, ARCH_CONVERT))
+		);
+	ASSERT(ip->i_xtrace);
+	ktrace_enter(ip->i_xtrace,
+		(void *)(__psint_t)(opcode | (whichfork << 16)),
+		(void *)fname, (void *)desc, (void *)ip,
+		(void *)(__psint_t)idx,
+		(void *)(__psint_t)cnt,
+		(void *)(__psunsigned_t)(ip->i_ino >> 32),
+		(void *)(__psunsigned_t)(unsigned)ip->i_ino,
+		(void *)(__psunsigned_t)(INT_GET(r1->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r1->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r1->l1, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l0, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l0, ARCH_CONVERT)),
+		(void *)(__psunsigned_t)(INT_GET(r2->l1, ARCH_CONVERT) >> 32),
+		(void *)(__psunsigned_t)(unsigned)(INT_GET(r2->l1, ARCH_CONVERT))
+		);
+}
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_delete_exlist.
+ */
+STATIC void
+xfs_bmap_trace_delete(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) deleted */
+	xfs_extnum_t	cnt,		/* count of entries deleted, 1 or 2 */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx,
+		cnt, &ifp->if_u1.if_extents[idx],
+		cnt == 2 ? &ifp->if_u1.if_extents[idx + 1] : NULL,
+		whichfork);
+}
+
+/*
+ * Add bmap trace entry prior to a call to xfs_bmap_insert_exlist, or
+ * reading in the extents list from the disk (in the btree).
+ */
+STATIC void
+xfs_bmap_trace_insert(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry(entries) inserted */
+	xfs_extnum_t	cnt,		/* count of entries inserted, 1 or 2 */
+	xfs_bmbt_irec_t *r1,		/* inserted record 1 */
+	xfs_bmbt_irec_t *r2,		/* inserted record 2 or null */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	tr1;		/* compressed record 1 */
+	xfs_bmbt_rec_t	tr2;		/* compressed record 2 if needed */
+
+	xfs_bmbt_set_all(&tr1, r1);
+	if (cnt == 2) {
+		ASSERT(r2 != NULL);
+		xfs_bmbt_set_all(&tr2, r2);
+	} else {
+		ASSERT(cnt == 1);
+		ASSERT(r2 == NULL);
+	}
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx,
+		cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork);
+}
+
+/*
+ * Add bmap trace entry after updating an extent list entry in place.
+ */
+STATIC void
+xfs_bmap_trace_post_update(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry updated */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx,
+		1, &ifp->if_u1.if_extents[idx], NULL, whichfork);
+}
+
+/*
+ * Add bmap trace entry prior to updating an extent list entry in place.
+ */
+STATIC void
+xfs_bmap_trace_pre_update(
+	char		*fname,		/* function name */
+	char		*desc,		/* operation description */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	idx,		/* index of entry to be updated */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1,
+		&ifp->if_u1.if_extents[idx], NULL, whichfork);
+}
+#endif	/* XFS_BMAP_TRACE */
+
+/*
+ * Compute the worst-case number of indirect blocks that will be used
+ * for ip's delayed extent of length "len".
+ */
+STATIC xfs_filblks_t
+xfs_bmap_worst_indlen(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_filblks_t	len)		/* delayed extent length */
+{
+	int		level;		/* btree level number */
+	int		maxrecs;	/* maximum record count at this level */
+	xfs_mount_t	*mp;		/* mount structure */
+	xfs_filblks_t	rval;		/* return value */
+
+	mp = ip->i_mount;
+	maxrecs = mp->m_bmap_dmxr[0];
+	for (level = 0, rval = 0;
+	     level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
+	     level++) {
+		len += maxrecs - 1;
+		do_div(len, maxrecs);
+		rval += len;
+		if (len == 1)
+			return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) -
+				level - 1;
+		if (level == 0)
+			maxrecs = mp->m_bmap_dmxr[1];
+	}
+	return rval;
+}
+
+#if defined(DEBUG) && defined(XFS_RW_TRACE)
+STATIC void
+xfs_bunmap_trace(
+	xfs_inode_t		*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	inst_t			*ra)
+{
+	if (ip->i_rwtrace == NULL)
+		return;
+	ktrace_enter(ip->i_rwtrace,
+		(void *)(__psint_t)XFS_BUNMAPI,
+		(void *)ip,
+		(void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff),
+		(void *)(__psint_t)(ip->i_d.di_size & 0xffffffff),
+		(void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff),
+		(void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff),
+		(void *)(__psint_t)len,
+		(void *)(__psint_t)flags,
+		(void *)(__psint_t)private.p_cpuid,
+		(void *)ra,
+		(void *)0,
+		(void *)0,
+		(void *)0,
+		(void *)0,
+		(void *)0,
+		(void *)0);
+}
+#endif
+
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int						/* error code */
+xfs_bmap_add_attrfork(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			rsvd)		/* OK to allocated reserved blocks in trans */
+{
+	int			blks;		/* space reservation */
+	int			committed;	/* xaction was committed */
+	int			error;		/* error return value */
+	xfs_fsblock_t		firstblock;	/* 1st block/ag allocated */
+	xfs_bmap_free_t		flist;		/* freed extent list */
+	int			logflags;	/* logging flags */
+	xfs_mount_t		*mp;		/* mount structure */
+	unsigned long		s;		/* spinlock spl value */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+	if (XFS_IFORK_Q(ip))
+		return 0;
+	mp = ip->i_mount;
+	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
+	tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK);
+	blks = XFS_ADDAFORK_SPACE_RES(mp);
+	if (rsvd)
+		tp->t_flags |= XFS_TRANS_RESERVE;
+	if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
+		goto error0;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (rsvd) {
+			error = xfs_trans_reserve_blkquota_force(tp, ip, blks);
+		} else {
+			error = xfs_trans_reserve_blkquota(tp, ip, blks);
+		}
+
+		if (error) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+			return error;
+		}
+	}
+	if (XFS_IFORK_Q(ip))
+		goto error1;
+	if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+		/*
+		 * For inodes coming from pre-6.2 filesystems.
+		 */
+		ASSERT(ip->i_d.di_aformat == 0);
+		ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	}
+	ASSERT(ip->i_d.di_anextents == 0);
+	VN_HOLD(XFS_ITOV(ip));
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_DEV:
+		ip->i_d.di_forkoff = roundup(sizeof(dev_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_UUID:
+		ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		ip->i_d.di_forkoff = mp->m_attroffset >> 3;
+		break;
+	default:
+		ASSERT(0);
+		error = XFS_ERROR(EINVAL);
+		goto error1;
+	}
+	ip->i_df.if_ext_max =
+		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp->if_ext_max =
+		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	ip->i_afp->if_flags = XFS_IFEXTENTS;
+	logflags = 0;
+	XFS_BMAP_INIT(&flist, &firstblock);
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_LOCAL:
+		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock,
+			&flist, &logflags);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist,
+			&logflags);
+		break;
+	default:
+		error = 0;
+		break;
+	}
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (error)
+		goto error2;
+	if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
+		s = XFS_SB_LOCK(mp);
+		if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) {
+			XFS_SB_VERSION_ADDATTR(&mp->m_sb);
+			XFS_SB_UNLOCK(mp, s);
+			xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
+		} else
+			XFS_SB_UNLOCK(mp, s);
+	}
+	if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed)))
+		goto error2;
+	error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL);
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+	return error;
+error2:
+	xfs_bmap_cancel(&flist);
+error1:
+	ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE));
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+error0:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+	return error;
+}
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+/* ARGSUSED */
+void
+xfs_bmap_add_free(
+	xfs_fsblock_t		bno,		/* fs block number of extent */
+	xfs_filblks_t		len,		/* length of extent */
+	xfs_bmap_free_t		*flist,		/* list of extents */
+	xfs_mount_t		*mp)		/* mount point structure */
+{
+	xfs_bmap_free_item_t	*cur;		/* current (next) element */
+	xfs_bmap_free_item_t	*new;		/* new element */
+	xfs_bmap_free_item_t	*prev;		/* previous element */
+#ifdef DEBUG
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+
+	ASSERT(bno != NULLFSBLOCK);
+	ASSERT(len > 0);
+	ASSERT(len <= MAXEXTLEN);
+	ASSERT(!ISNULLSTARTBLOCK(bno));
+	agno = XFS_FSB_TO_AGNO(mp, bno);
+	agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	ASSERT(agbno < mp->m_sb.sb_agblocks);
+	ASSERT(len < mp->m_sb.sb_agblocks);
+	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
+#endif
+	ASSERT(xfs_bmap_free_item_zone != NULL);
+	new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+	new->xbfi_startblock = bno;
+	new->xbfi_blockcount = (xfs_extlen_t)len;
+	for (prev = NULL, cur = flist->xbf_first;
+	     cur != NULL;
+	     prev = cur, cur = cur->xbfi_next) {
+		if (cur->xbfi_startblock >= bno)
+			break;
+	}
+	if (prev)
+		prev->xbfi_next = new;
+	else
+		flist->xbf_first = new;
+	new->xbfi_next = cur;
+	flist->xbf_count++;
+}
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem.	Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	int		whichfork)	/* data or attr fork */
+{
+	int		level;		/* btree level */
+	uint		maxblocks;	/* max blocks at this level */
+	uint		maxleafents;	/* max leaf entries possible */
+	int		maxrootrecs;	/* max records in root block */
+	int		minleafrecs;	/* min records in leaf block */
+	int		minnoderecs;	/* min records in node block */
+	int		sz;		/* root block size */
+
+	/*
+	 * The maximum number of extents in a file, hence the maximum
+	 * number of leaf entries, is controlled by the type of di_nextents
+	 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
+	 * (a signed 16-bit number, xfs_aextnum_t).
+	 */
+	maxleafents = (whichfork == XFS_DATA_FORK) ? MAXEXTNUM : MAXAEXTNUM;
+	minleafrecs = mp->m_bmap_dmnr[0];
+	minnoderecs = mp->m_bmap_dmnr[1];
+	sz = (whichfork == XFS_DATA_FORK) ?
+		mp->m_attroffset :
+		mp->m_sb.sb_inodesize - mp->m_attroffset;
+	maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++) {
+		if (maxblocks <= maxrootrecs)
+			maxblocks = 1;
+		else
+			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	}
+	mp->m_bm_maxlevels[whichfork] = level;
+}
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.	We never free any extents in
+ * the first transaction.  This is to allow the caller to make the first
+ * transaction a synchronous one so that the pointers to the data being
+ * broken in this transaction will be permanent before the data is actually
+ * freed.  This is necessary to prevent blocks from being reallocated
+ * and written to before the free and reallocation are actually permanent.
+ * We do not just make the first transaction synchronous here, because
+ * there are more efficient ways to gain the same protection in some cases
+ * (see the file truncation code).
+ *
+ * Return 1 if the given transaction was committed and a new one
+ * started, and 0 otherwise in the committed parameter.
+ */
+/*ARGSUSED*/
+int						/* error */
+xfs_bmap_finish(
+	xfs_trans_t		**tp,		/* transaction pointer addr */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_fsblock_t		firstblock,	/* controlled ag for allocs */
+	int			*committed)	/* xact committed or not */
+{
+	xfs_efd_log_item_t	*efd;		/* extent free data */
+	xfs_efi_log_item_t	*efi;		/* extent free intention */
+	int			error;		/* error return value */
+	xfs_bmap_free_item_t	*free;		/* free extent list item */
+	unsigned int		logres;		/* new log reservation */
+	unsigned int		logcount;	/* new log count */
+	xfs_mount_t		*mp;		/* filesystem mount structure */
+	xfs_bmap_free_item_t	*next;		/* next item on free list */
+	xfs_trans_t		*ntp;		/* new transaction pointer */
+
+	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+	if (flist->xbf_count == 0) {
+		*committed = 0;
+		return 0;
+	}
+	ntp = *tp;
+	efi = xfs_trans_get_efi(ntp, flist->xbf_count);
+	for (free = flist->xbf_first; free; free = free->xbfi_next)
+		xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
+			free->xbfi_blockcount);
+	logres = ntp->t_log_res;
+	logcount = ntp->t_log_count;
+	ntp = xfs_trans_dup(*tp);
+	error = xfs_trans_commit(*tp, 0, NULL);
+	*tp = ntp;
+	*committed = 1;
+	/*
+	 * We have a new transaction, so we should return committed=1,
+	 * even though we're returning an error.
+	 */
+	if (error) {
+		return error;
+	}
+	if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
+			logcount)))
+		return error;
+	efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
+	for (free = flist->xbf_first; free != NULL; free = next) {
+		next = free->xbfi_next;
+		if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
+				free->xbfi_blockcount))) {
+			/*
+			 * The bmap free list will be cleaned up at a
+			 * higher level.  The EFI will be canceled when
+			 * this transaction is aborted.
+			 * Need to force shutdown here to make sure it
+			 * happens, since this transaction may not be
+			 * dirty yet.
+			 */
+			mp = ntp->t_mountp;
+			if (!XFS_FORCED_SHUTDOWN(mp))
+				xfs_force_shutdown(mp,
+						   (error == EFSCORRUPTED) ?
+						   XFS_CORRUPT_INCORE :
+						   XFS_METADATA_IO_ERROR);
+			return error;
+		}
+		xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
+			free->xbfi_blockcount);
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	return 0;
+}
+
+/*
+ * Free up any items left in the list.
+ */
+void
+xfs_bmap_cancel(
+	xfs_bmap_free_t		*flist) /* list of bmap_free_items */
+{
+	xfs_bmap_free_item_t	*free;	/* free list item */
+	xfs_bmap_free_item_t	*next;
+
+	if (flist->xbf_count == 0)
+		return;
+	ASSERT(flist->xbf_first != NULL);
+	for (free = flist->xbf_first; free; free = next) {
+		next = free->xbfi_next;
+		xfs_bmap_del_free(flist, NULL, free);
+	}
+	ASSERT(flist->xbf_count == 0);
+}
+
+/*
+ * Returns EINVAL if the specified file is not swappable.
+ */
+int						/* error */
+xfs_bmap_check_swappable(
+	xfs_inode_t	*ip)			/* incore inode */
+{
+	xfs_bmbt_rec_t	*base;			/* base of extent array */
+	xfs_bmbt_rec_t	*ep;			/* pointer to an extent entry */
+	xfs_fileoff_t	end_fsb;		/* last block of file within size */
+	xfs_bmbt_irec_t ext;			/* extent list entry, decoded */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_fileoff_t	lastaddr;		/* last block number seen */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+	int		retval = 0;		/* return value */
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+
+	/*
+	 * Check for a zero length file.
+	 */
+	if (ip->i_d.di_size == 0)
+		goto check_done;
+
+	ASSERT(XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS);
+
+	ifp = &ip->i_df;
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (retval = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
+		goto check_done;
+	/*
+	 * Scan extents until the file size is reached. Look for
+	 * holes or unwritten extents, since I/O to these would cause
+	 * a transaction.
+	 */
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, ip->i_d.di_size);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	base = &ifp->if_u1.if_extents[0];
+	for (lastaddr = 0, ep = base; ep < &base[nextents]; ep++) {
+		xfs_bmbt_get_all(ep, &ext);
+		if (lastaddr < ext.br_startoff ||
+		    ext.br_state != XFS_EXT_NORM) {
+			goto error_done;
+		}
+		if (end_fsb <= (lastaddr = ext.br_startoff +
+						ext.br_blockcount))
+			goto check_done;
+	}
+error_done:
+	retval = XFS_ERROR(EINVAL);
+
+
+check_done:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	return retval;
+}
+
+/*
+ * Returns the file-relative block number of the first unused block(s)
+ * in the file with at least "len" logically contiguous blocks free.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ * Return 0 if the file is currently local (in-inode).
+ */
+int						/* error */
+xfs_bmap_first_unused(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_extlen_t	len,			/* size of hole to find */
+	xfs_fileoff_t	*first_unused,		/* unused block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;			/* base of extent array */
+	xfs_bmbt_rec_t	*ep;			/* pointer to an extent entry */
+	int		error;			/* error return value */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_fileoff_t	lastaddr;		/* last block number seen */
+	xfs_fileoff_t	lowest;			/* lowest useful block */
+	xfs_fileoff_t	max;			/* starting useful block */
+	xfs_fileoff_t	off;			/* offset for this block */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ||
+	       XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*first_unused = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	lowest = *first_unused;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	base = &ifp->if_u1.if_extents[0];
+	for (lastaddr = 0, max = lowest, ep = base;
+	     ep < &base[nextents];
+	     ep++) {
+		off = xfs_bmbt_get_startoff(ep);
+		/*
+		 * See if the hole before this extent will work.
+		 */
+		if (off >= lowest + len && off - max >= len) {
+			*first_unused = max;
+			return 0;
+		}
+		lastaddr = off + xfs_bmbt_get_blockcount(ep);
+		max = XFS_FILEOFF_MAX(lastaddr, lowest);
+	}
+	*first_unused = max;
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the last block + 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent list.
+ * Returns 0 for local files, as they do not have an extent list.
+ */
+int						/* error */
+xfs_bmap_last_before(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_fileoff_t	*last_block,		/* last block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_fileoff_t	bno;			/* input file offset */
+	int		eof;			/* hit end of file */
+	xfs_bmbt_rec_t	*ep;			/* pointer to last extent */
+	int		error;			/* error return value */
+	xfs_bmbt_irec_t got;			/* current extent value */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	lastx;			/* last extent used */
+	xfs_bmbt_irec_t prev;			/* previous extent value */
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+	       return XFS_ERROR(EIO);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	bno = *last_block - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	if (eof || xfs_bmbt_get_startoff(ep) > bno) {
+		if (prev.br_startoff == NULLFILEOFF)
+			*last_block = 0;
+		else
+			*last_block = prev.br_startoff + prev.br_blockcount;
+	}
+	/*
+	 * Otherwise *last_block is already the right answer.
+	 */
+	return 0;
+}
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size, it is based on the extent list.
+ * Returns 0 for local files, as they do not have an extent list.
+ */
+int						/* error */
+xfs_bmap_last_offset(
+	xfs_trans_t	*tp,			/* transaction pointer */
+	xfs_inode_t	*ip,			/* incore inode */
+	xfs_fileoff_t	*last_block,		/* last block */
+	int		whichfork)		/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;			/* base of extent array */
+	xfs_bmbt_rec_t	*ep;			/* pointer to last extent */
+	int		error;			/* error return value */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
+	       return XFS_ERROR(EIO);
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		*last_block = 0;
+		return 0;
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (!nextents) {
+		*last_block = 0;
+		return 0;
+	}
+	base = &ifp->if_u1.if_extents[0];
+	ASSERT(base != NULL);
+	ep = &base[nextents - 1];
+	*last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep);
+	return 0;
+}
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int					/* 1=>1 block, 0=>otherwise */
+xfs_bmap_one_block(
+	xfs_inode_t	*ip,		/* incore inode */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*ep;		/* ptr to fork's extent */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	int		rval;		/* return value */
+	xfs_bmbt_irec_t s;		/* internal version of extent */
+
+#ifndef DEBUG
+	if (whichfork == XFS_DATA_FORK)
+		return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize;
+#endif	/* !DEBUG */
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
+		return 0;
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		return 0;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	ep = ifp->if_u1.if_extents;
+	xfs_bmbt_get_all(ep, &s);
+	rval = s.br_startoff == 0 && s.br_blockcount == 1;
+	if (rval && whichfork == XFS_DATA_FORK)
+		ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
+	return rval;
+}
+
+/*
+ * Read in the extents to if_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in. If the file system cannot contain unwritten
+ * extents, the records are checked for no "state" flags.
+ */
+int					/* error */
+xfs_bmap_read_extents(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_inode_t		*ip,	/* incore inode */
+	int			whichfork) /* data or attr fork */
+{
+	xfs_bmbt_block_t	*block; /* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_bmap_read_extents";
+#endif
+	xfs_extnum_t		i;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+	/* REFERENCED */
+	xfs_extnum_t		room;	/* number of entries there's room for */
+	xfs_bmbt_rec_t		*trp;	/* target record pointer */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+					XFS_EXTFMT_INODE(ip);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+	level = INT_GET(block->bb_level, ARCH_CONVERT);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
+	bno = INT_GET(*pp, ARCH_CONVERT);
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			return error;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_BMAP_SANITY_CHECK(mp, block, level),
+			error0);
+		if (level == 0)
+			break;
+		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
+			1, mp->m_bmap_dmxr[1]);
+#ifndef __KERNEL__
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)),
+			error0);
+#else	/* additional, temporary, debugging code */
+		if (!(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)))) {
+			cmn_err(CE_NOTE,
+			"xfs_bmap_read_extents: FSB Sanity Check:");
+			if (!(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount))
+				cmn_err(CE_NOTE,
+					"bad AG count %d < agcount %d",
+					XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)),
+					mp->m_sb.sb_agcount);
+			if (!(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks))
+				cmn_err(CE_NOTE,
+					"bad AG BNO %d < %d",
+					XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)),
+					mp->m_sb.sb_agblocks);
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto error0;
+		}
+#endif
+		bno = INT_GET(*pp, ARCH_CONVERT);
+		xfs_trans_brelse(tp, bp);
+	}
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	room = ifp->if_bytes / (uint)sizeof(*trp);
+	trp = ifp->if_u1.if_extents;
+	i = 0;
+	/*
+	 * Loop over all leaf nodes.  Copy information to the extent list.
+	 */
+	for (;;) {
+		xfs_bmbt_rec_t	*frp;
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+
+
+		num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+		if (i + num_recs > room) {
+			ASSERT(i + num_recs <= room);
+			xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+				"corrupt dinode %Lu, (btree extents).  "
+				"Unmount and run xfs_repair.",
+				(unsigned long long) ip->i_ino);
+			goto error0;
+		}
+#ifndef __KERNEL__
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_BMAP_SANITY_CHECK(mp, block, 0),
+			error0);
+#else	/* additional, temporary, debugging code */
+		if (!(XFS_BMAP_SANITY_CHECK(mp, block, 0))) {
+			cmn_err(CE_NOTE,
+			"xfs_bmap_read_extents: BMAP Sanity Check:");
+			if (!(INT_GET(block->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC))
+				cmn_err(CE_NOTE,
+					"bb_magic 0x%x",
+					INT_GET(block->bb_magic, ARCH_CONVERT));
+			if (!(INT_GET(block->bb_level, ARCH_CONVERT) == level))
+				cmn_err(CE_NOTE,
+					"bb_level %d",
+					INT_GET(block->bb_level, ARCH_CONVERT));
+			if (!(INT_GET(block->bb_numrecs, ARCH_CONVERT) > 0))
+				cmn_err(CE_NOTE,
+					"bb_numrecs %d",
+					INT_GET(block->bb_numrecs, ARCH_CONVERT));
+			if (!(INT_GET(block->bb_numrecs, ARCH_CONVERT) <= (mp)->m_bmap_dmxr[(level) != 0]))
+				cmn_err(CE_NOTE,
+					"bb_numrecs %d < m_bmap_dmxr[] %d",
+					INT_GET(block->bb_numrecs, ARCH_CONVERT),
+					(mp)->m_bmap_dmxr[(level) != 0]);
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto error0;
+		}
+#endif
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+		nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+		if (nextbno != NULLFSBLOCK)
+			xfs_btree_reada_bufl(mp, nextbno, 1);
+		/*
+		 * Copy records into the extent list.
+		 */
+		frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
+			block, 1, mp->m_bmap_dmxr[0]);
+		bcopy(frp, trp, num_recs * sizeof(*frp));
+		if (exntf == XFS_EXTFMT_NOSTATE) {
+			/*
+			 * Check all attribute bmap btree records and
+			 * any "older" data bmap btree records for a
+			 * set bit in the "extent flag" position.
+			 */
+			if (xfs_check_nostate_extents(trp, num_recs)) {
+				goto error0;
+			}
+		}
+		trp += num_recs;
+		i += num_recs;
+		xfs_trans_brelse(tp, bp);
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			return error;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+	}
+	ASSERT(i == ifp->if_bytes / (uint)sizeof(*trp));
+	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
+	xfs_bmap_trace_exlist(fname, ip, i, whichfork);
+	return 0;
+error0:
+	xfs_trans_brelse(tp, bp);
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
+#ifdef XFS_BMAP_TRACE
+/*
+ * Add bmap trace insert entries for all the contents of the extent list.
+ */
+void
+xfs_bmap_trace_exlist(
+	char		*fname,		/* function name */
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_extnum_t	cnt,		/* count of entries in the list */
+	int		whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t	*base;		/* base of extent list */
+	xfs_bmbt_rec_t	*ep;		/* current entry in extent list */
+	xfs_extnum_t	idx;		/* extent list entry number */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_irec_t s;		/* extent list record */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(cnt == ifp->if_bytes / (uint)sizeof(*base));
+	base = ifp->if_u1.if_extents;
+	for (idx = 0, ep = base; idx < cnt; idx++, ep++) {
+		xfs_bmbt_get_all(ep, &s);
+		xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL,
+			whichfork);
+	}
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * Validate that the bmbt_irecs being returned from bmapi are valid
+ * given the callers original parameters.  Specifically check the
+ * ranges of the returned irecs to ensure that they only extent beyond
+ * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
+ */
+STATIC void
+xfs_bmap_validate_ret(
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_bmbt_irec_t		*mval,
+	int			nmap,
+	int			ret_nmap)
+{
+	int			i;		/* index to map values */
+
+	ASSERT(ret_nmap <= nmap);
+
+	for (i = 0; i < ret_nmap; i++) {
+		ASSERT(mval[i].br_blockcount > 0);
+		if (!(flags & XFS_BMAPI_ENTIRE)) {
+			ASSERT(mval[i].br_startoff >= bno);
+			ASSERT(mval[i].br_blockcount <= len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount <=
+			       bno + len);
+		} else {
+			ASSERT(mval[i].br_startoff < bno + len);
+			ASSERT(mval[i].br_startoff + mval[i].br_blockcount >
+			       bno);
+		}
+		ASSERT(i == 0 ||
+		       mval[i - 1].br_startoff + mval[i - 1].br_blockcount ==
+		       mval[i].br_startoff);
+		if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY))
+			ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK &&
+			       mval[i].br_startblock != HOLESTARTBLOCK);
+		ASSERT(mval[i].br_state == XFS_EXT_NORM ||
+		       mval[i].br_state == XFS_EXT_UNWRITTEN);
+	}
+}
+#endif /* DEBUG */
+
+
+/*
+ * Map file blocks to filesystem blocks.
+ * File range is given by the bno/len pair.
+ * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
+ * into a hole or past eof.
+ * Only allocates blocks from a single allocation group,
+ * to avoid locking problems.
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int					/* error */
+xfs_bmapi(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode */
+	xfs_fileoff_t	bno,		/* starting file offs. mapped */
+	xfs_filblks_t	len,		/* length to map in file */
+	int		flags,		/* XFS_BMAPI_... */
+	xfs_fsblock_t	*firstblock,	/* first allocated block
+					   controls a.g. for allocs */
+	xfs_extlen_t	total,		/* total blocks needed */
+	xfs_bmbt_irec_t *mval,		/* output: map values */
+	int		*nmap,		/* i/o: mval size/count */
+	xfs_bmap_free_t *flist)		/* i/o: list extents to free */
+{
+	xfs_fsblock_t	abno;		/* allocated block number */
+	xfs_extlen_t	alen;		/* allocated extent length */
+	xfs_fileoff_t	aoff;		/* allocated file offset */
+	xfs_bmalloca_t	bma;		/* args for xfs_bmap_alloc */
+	char		contig;		/* allocation must be one extent */
+	xfs_btree_cur_t *cur;		/* bmap btree cursor */
+	char		delay;		/* this request is for delayed alloc */
+	xfs_fileoff_t	end;		/* end of mapped file region */
+	int		eof;		/* we've hit the end of extent list */
+	xfs_bmbt_rec_t	*ep;		/* extent list entry pointer */
+	int		error;		/* error return */
+	char		exact;		/* don't do all of wasdelayed extent */
+	xfs_bmbt_irec_t got;		/* current extent list record */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_extlen_t	indlen;		/* indirect blocks length */
+	char		inhole;		/* current location is hole in file */
+	xfs_extnum_t	lastx;		/* last useful extent number */
+	int		logflags;	/* flags for transaction logging */
+	xfs_extlen_t	minleft;	/* min blocks left after allocation */
+	xfs_extlen_t	minlen;		/* min allocation size */
+	xfs_mount_t	*mp;		/* xfs mount structure */
+	int		n;		/* current extent index */
+	int		nallocs;	/* number of extents alloc\'d */
+	xfs_extnum_t	nextents;	/* number of extents in file */
+	xfs_fileoff_t	obno;		/* old block number (offset) */
+	xfs_bmbt_irec_t prev;		/* previous extent list record */
+	char		stateless;	/* ignore state flag set */
+	int		tmp_logflags;	/* temp flags holder */
+	char		trim;		/* output trimmed to match range */
+	char		userdata;	/* allocating non-metadata */
+	char		wasdelay;	/* old extent was delayed */
+	int		whichfork;	/* data or attr fork */
+	char		wr;		/* this is a write request */
+	char		rsvd;		/* OK to allocate reserved blocks */
+#ifdef DEBUG
+	xfs_fileoff_t	orig_bno;	/* original block number value */
+	int		orig_flags;	/* original flags arg value */
+	xfs_filblks_t	orig_len;	/* original value of len arg */
+	xfs_bmbt_irec_t *orig_mval;	/* original value of mval */
+	int		orig_nmap;	/* original value of *nmap */
+
+	orig_bno = bno;
+	orig_len = len;
+	orig_flags = flags;
+	orig_mval = mval;
+	orig_nmap = *nmap;
+#endif
+	ASSERT(*nmap >= 1);
+	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE));
+	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+		XFS_ATTR_FORK : XFS_DATA_FORK;
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) {
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	mp = ip->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	if ((wr = (flags & XFS_BMAPI_WRITE)) != 0)
+		XFS_STATS_INC(xfsstats.xs_blk_mapw);
+	else
+		XFS_STATS_INC(xfsstats.xs_blk_mapr);
+	delay = (flags & XFS_BMAPI_DELAY) != 0;
+	trim = (flags & XFS_BMAPI_ENTIRE) == 0;
+	userdata = (flags & XFS_BMAPI_METADATA) == 0;
+	exact = (flags & XFS_BMAPI_EXACT) != 0;
+	rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
+	contig = (flags & XFS_BMAPI_CONTIG) != 0;
+	/*
+	 * stateless is used to combine extents which
+	 * differ only due to the state of the extents.
+	 * This technique is used from xfs_getbmap()
+	 * when the caller does not wish to see the
+	 * separation (which is the default).
+	 *
+	 * This technique is also used when writing a
+	 * buffer which has been partially written,
+	 * (usually by being flushed during a chunkread),
+	 * to ensure one write takes place. This also
+	 * prevents a change in the xfs inode extents at
+	 * this time, intentionally. This change occurs
+	 * on completion of the write operation, in
+	 * xfs_strat_comp(), where the xfs_bmapi() call
+	 * is transactioned, and the extents combined.
+	 */
+	stateless = (flags & XFS_BMAPI_IGSTATE) != 0;
+	if (stateless && wr)	/* if writing unwritten space, no */
+		wr = 0;		/* allocations are allowed */
+	ASSERT(wr || !delay);
+	logflags = 0;
+	nallocs = 0;
+	cur = NULL;
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		ASSERT(wr && tp);
+		if ((error = xfs_bmap_local_to_extents(tp, ip, firstblock, total,
+				&logflags, whichfork)))
+			goto error0;
+	}
+	if (wr && *firstblock == NULLFSBLOCK) {
+		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
+			minleft = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1;
+		else
+			minleft = 1;
+	} else
+		minleft = 0;
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		goto error0;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	n = 0;
+	end = bno + len;
+	obno = bno;
+	bma.ip = NULL;
+	while (bno < end && n < *nmap) {
+		/*
+		 * Reading past eof, act as though there's a hole
+		 * up to end.
+		 */
+		if (eof && !wr)
+			got.br_startoff = end;
+		inhole = eof || got.br_startoff > bno;
+		wasdelay = wr && !inhole && !delay &&
+			ISNULLSTARTBLOCK(got.br_startblock);
+		/*
+		 * First, deal with the hole before the allocated space
+		 * that we found, if any.
+		 */
+		if (wr && (inhole || wasdelay)) {
+			/*
+			 * For the wasdelay case, we could also just
+			 * allocate the stuff asked for in this bmap call
+			 * but that wouldn't be as good.
+			 */
+			if (wasdelay && !exact) {
+				alen = (xfs_extlen_t)got.br_blockcount;
+				aoff = got.br_startoff;
+				if (lastx != NULLEXTNUM && lastx) {
+					ep = &ifp->if_u1.if_extents[lastx - 1];
+					xfs_bmbt_get_all(ep, &prev);
+				}
+			} else if (wasdelay) {
+				alen = (xfs_extlen_t)
+					XFS_FILBLKS_MIN(len,
+						(got.br_startoff +
+						 got.br_blockcount) - bno);
+				aoff = bno;
+			} else {
+				alen = (xfs_extlen_t)
+					XFS_FILBLKS_MIN(len, MAXEXTLEN);
+				if (!eof)
+					alen = (xfs_extlen_t)
+						XFS_FILBLKS_MIN(alen,
+							got.br_startoff - bno);
+				aoff = bno;
+			}
+			minlen = contig ? alen : 1;
+			if (delay) {
+				indlen = (xfs_extlen_t)
+					xfs_bmap_worst_indlen(ip, alen);
+				ASSERT(indlen > 0);
+				/*
+				 * Make a transaction-less quota reservation for
+				 * delayed allocation blocks. This number gets
+				 * adjusted later.
+				 * We return EDQUOT if we haven't allocated
+				 * blks already inside this loop;
+				 */
+				if (XFS_IS_QUOTA_ON(ip->i_mount) &&
+				    xfs_trans_reserve_blkquota(NULL, ip,
+					    (long)alen)) {
+					if (n == 0) {
+						*nmap = 0;
+						ASSERT(cur == NULL);
+						return XFS_ERROR(EDQUOT);
+					}
+					break;
+				}
+				if (xfs_mod_incore_sb(ip->i_mount,
+						XFS_SBS_FDBLOCKS,
+						-(alen + indlen), rsvd)) {
+					if (XFS_IS_QUOTA_ON(ip->i_mount))
+						xfs_trans_unreserve_blkquota(
+							NULL, ip, (long)alen);
+					break;
+				}
+				ip->i_delayed_blks += alen;
+				abno = NULLSTARTBLOCK(indlen);
+			} else {
+				/*
+				 * If first time, allocate and fill in
+				 * once-only bma fields.
+				 */
+				if (bma.ip == NULL) {
+					bma.tp = tp;
+					bma.ip = ip;
+					bma.prevp = &prev;
+					bma.gotp = &got;
+					bma.total = total;
+					bma.userdata = 0;
+				}
+				/* Indicate if this is the first user data
+				 * in the file, or just any user data.
+				 */
+				if (userdata) {
+					bma.userdata = (aoff == 0) ?
+						XFS_ALLOC_INITIAL_USER_DATA :
+						XFS_ALLOC_USERDATA;
+				}
+				/*
+				 * Fill in changeable bma fields.
+				 */
+				bma.eof = eof;
+				bma.firstblock = *firstblock;
+				bma.alen = alen;
+				bma.off = aoff;
+				bma.wasdel = wasdelay;
+				bma.minlen = minlen;
+				bma.low = flist->xbf_low;
+				bma.minleft = minleft;
+				/*
+				 * Only want to do the alignment at the
+				 * eof if it is userdata and allocation length
+				 * is larger than a stripe unit.
+				 */
+				if (mp->m_dalign && alen >= mp->m_dalign &&
+				    userdata && whichfork == XFS_DATA_FORK) {
+					if ((error = xfs_bmap_isaeof(ip, aoff,
+							whichfork, &bma.aeof)))
+						goto error0;
+				} else
+					bma.aeof = 0;
+				/*
+				 * Call allocator.
+				 */
+				if ((error = xfs_bmap_alloc(&bma)))
+					goto error0;
+				/*
+				 * Copy out result fields.
+				 */
+				abno = bma.rval;
+				if ((flist->xbf_low = bma.low))
+					minleft = 0;
+				alen = bma.alen;
+				aoff = bma.off;
+				ASSERT(*firstblock == NULLFSBLOCK ||
+				       XFS_FSB_TO_AGNO(ip->i_mount,
+					       *firstblock) ==
+				       XFS_FSB_TO_AGNO(ip->i_mount,
+					       bma.firstblock) ||
+				       (flist->xbf_low &&
+					XFS_FSB_TO_AGNO(ip->i_mount,
+						*firstblock) <
+					XFS_FSB_TO_AGNO(ip->i_mount,
+						bma.firstblock)));
+				*firstblock = bma.firstblock;
+				if (cur)
+					cur->bc_private.b.firstblock =
+						*firstblock;
+				if (abno == NULLFSBLOCK)
+					break;
+				if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
+					cur = xfs_btree_init_cursor(ip->i_mount,
+						tp, NULL, 0, XFS_BTNUM_BMAP,
+						ip, whichfork);
+					cur->bc_private.b.firstblock =
+						*firstblock;
+					cur->bc_private.b.flist = flist;
+				}
+				/*
+				 * Bump the number of extents we've allocated
+				 * in this call.
+				 */
+				nallocs++;
+			}
+			if (cur)
+				cur->bc_private.b.flags =
+					wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0;
+			got.br_startoff = aoff;
+			got.br_startblock = abno;
+			got.br_blockcount = alen;
+			got.br_state = XFS_EXT_NORM;	/* assume normal */
+			/*
+			 * Determine state of extent, and the filesystem.
+			 * A wasdelay extent has been initialized, so
+			 * shouldn't be flagged as unwritten.
+			 */
+			if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+				if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
+					got.br_state = XFS_EXT_UNWRITTEN;
+			}
+			error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
+				firstblock, flist, &tmp_logflags, whichfork,
+				rsvd);
+			logflags |= tmp_logflags;
+			if (error)
+				goto error0;
+			lastx = ifp->if_lastex;
+			ep = &ifp->if_u1.if_extents[lastx];
+			nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+			xfs_bmbt_get_all(ep, &got);
+			ASSERT(got.br_startoff <= aoff);
+			ASSERT(got.br_startoff + got.br_blockcount >=
+				aoff + alen);
+#ifdef DEBUG
+			if (delay) {
+				ASSERT(ISNULLSTARTBLOCK(got.br_startblock));
+				ASSERT(STARTBLOCKVAL(got.br_startblock) > 0);
+			}
+			ASSERT(got.br_state == XFS_EXT_NORM ||
+			       got.br_state == XFS_EXT_UNWRITTEN);
+#endif
+			/*
+			 * Fall down into the found allocated space case.
+			 */
+		} else if (inhole) {
+			/*
+			 * Reading in a hole.
+			 */
+			mval->br_startoff = bno;
+			mval->br_startblock = HOLESTARTBLOCK;
+			mval->br_blockcount =
+				XFS_FILBLKS_MIN(len, got.br_startoff - bno);
+			mval->br_state = XFS_EXT_NORM;
+			bno += mval->br_blockcount;
+			len -= mval->br_blockcount;
+			mval++;
+			n++;
+			continue;
+		}
+		/*
+		 * Then deal with the allocated space we found.
+		 */
+		ASSERT(ep != NULL);
+		if (trim && (got.br_startoff + got.br_blockcount > obno)) {
+			if (obno > bno)
+				bno = obno;
+			ASSERT((bno >= obno) || (n == 0));
+			ASSERT(bno < end);
+			mval->br_startoff = bno;
+			if (ISNULLSTARTBLOCK(got.br_startblock)) {
+				ASSERT(!wr || delay);
+				mval->br_startblock = DELAYSTARTBLOCK;
+			} else
+				mval->br_startblock =
+					got.br_startblock +
+					(bno - got.br_startoff);
+			/*
+			 * Return the minimum of what we got and what we
+			 * asked for for the length.  We can use the len
+			 * variable here because it is modified below
+			 * and we could have been there before coming
+			 * here if the first part of the allocation
+			 * didn't overlap what was asked for.
+			 */
+			mval->br_blockcount =
+				XFS_FILBLKS_MIN(end - bno, got.br_blockcount -
+					(bno - got.br_startoff));
+			mval->br_state = got.br_state;
+			ASSERT(mval->br_blockcount <= len);
+		} else {
+			*mval = got;
+			if (ISNULLSTARTBLOCK(mval->br_startblock)) {
+				ASSERT(!wr || delay);
+				mval->br_startblock = DELAYSTARTBLOCK;
+			}
+		}
+
+		/*
+		 * Check if writing previously allocated but
+		 * unwritten extents.
+		 */
+		if (wr && mval->br_state == XFS_EXT_UNWRITTEN &&
+		    ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) {
+			/*
+			 * Modify (by adding) the state flag, if writing.
+			 */
+			ASSERT(mval->br_blockcount <= len);
+			if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
+				cur = xfs_btree_init_cursor(ip->i_mount,
+					tp, NULL, 0, XFS_BTNUM_BMAP,
+					ip, whichfork);
+				cur->bc_private.b.firstblock =
+					*firstblock;
+				cur->bc_private.b.flist = flist;
+			}
+			mval->br_state = XFS_EXT_NORM;
+			error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
+				firstblock, flist, &tmp_logflags, whichfork,
+				rsvd);
+			logflags |= tmp_logflags;
+			if (error)
+				goto error0;
+			lastx = ifp->if_lastex;
+			ep = &ifp->if_u1.if_extents[lastx];
+			nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+			xfs_bmbt_get_all(ep, &got);
+			/*
+			 * We may have combined previously unwritten
+			 * space with written space, so generate
+			 * another request.
+			 */
+			if (mval->br_blockcount < len)
+				continue;
+		}
+
+		ASSERT(!trim ||
+		       ((mval->br_startoff + mval->br_blockcount) <= end));
+		ASSERT(!trim || (mval->br_blockcount <= len) ||
+		       (mval->br_startoff < obno));
+		bno = mval->br_startoff + mval->br_blockcount;
+		len = end - bno;
+		if (n > 0 && mval->br_startoff == mval[-1].br_startoff) {
+			ASSERT(mval->br_startblock == mval[-1].br_startblock);
+			ASSERT(mval->br_blockcount > mval[-1].br_blockcount);
+			ASSERT(mval->br_state == mval[-1].br_state);
+			mval[-1].br_blockcount = mval->br_blockcount;
+			mval[-1].br_state = mval->br_state;
+		} else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK &&
+			   mval[-1].br_startblock != DELAYSTARTBLOCK &&
+			   mval[-1].br_startblock != HOLESTARTBLOCK &&
+			   mval->br_startblock ==
+			   mval[-1].br_startblock + mval[-1].br_blockcount &&
+			   (stateless || mval[-1].br_state == mval->br_state)) {
+			ASSERT(mval->br_startoff ==
+			       mval[-1].br_startoff + mval[-1].br_blockcount);
+			mval[-1].br_blockcount += mval->br_blockcount;
+		} else if (n > 0 &&
+			   mval->br_startblock == DELAYSTARTBLOCK &&
+			   mval[-1].br_startblock == DELAYSTARTBLOCK &&
+			   mval->br_startoff ==
+			   mval[-1].br_startoff + mval[-1].br_blockcount) {
+			mval[-1].br_blockcount += mval->br_blockcount;
+			mval[-1].br_state = mval->br_state;
+		} else if (!((n == 0) &&
+			     ((mval->br_startoff + mval->br_blockcount) <=
+			      obno))) {
+			mval++;
+			n++;
+		}
+		/*
+		 * If we're done, stop now.  Stop when we've allocated
+		 * XFS_BMAP_MAX_NMAP extents no matter what.  Otherwise
+		 * the transaction may get too big.
+		 */
+		if (bno >= end || n >= *nmap || nallocs >= *nmap)
+			break;
+		/*
+		 * Else go on to the next record.
+		 */
+		ep++;
+		lastx++;
+		if (lastx >= nextents) {
+			eof = 1;
+			prev = got;
+		} else
+			xfs_bmbt_get_all(ep, &got);
+	}
+	ifp->if_lastex = lastx;
+	*nmap = n;
+	/*
+	 * Transform from btree to extents, give it cur.
+	 */
+	if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+		ASSERT(wr && cur);
+		error = xfs_bmap_btree_to_extents(tp, ip, cur,
+			&tmp_logflags, whichfork, 0);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
+	       XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+	error = 0;
+
+error0:
+	/*
+	 * Log everything.  Do this after conversion, there's no point in
+	 * logging the extent list if we've converted to btree format.
+	 */
+	if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		logflags &= ~XFS_ILOG_FEXT(whichfork);
+	else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		logflags &= ~XFS_ILOG_FBROOT(whichfork);
+	/*
+	 * Log whatever the flags say, even if error.  Otherwise we might miss
+	 * detecting a case where the data is changed, there's an error,
+	 * and it's not logged so we don't shutdown when we should.
+	 */
+	if (logflags) {
+		ASSERT(tp && wr);
+		xfs_trans_log_inode(tp, ip, logflags);
+	}
+	if (cur) {
+		if (!error) {
+			ASSERT(*firstblock == NULLFSBLOCK ||
+			       XFS_FSB_TO_AGNO(ip->i_mount, *firstblock) ==
+			       XFS_FSB_TO_AGNO(ip->i_mount,
+				       cur->bc_private.b.firstblock) ||
+			       (flist->xbf_low &&
+				XFS_FSB_TO_AGNO(ip->i_mount, *firstblock) <
+				XFS_FSB_TO_AGNO(ip->i_mount,
+					cur->bc_private.b.firstblock)));
+			*firstblock = cur->bc_private.b.firstblock;
+		}
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	if (!error)
+		xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+			orig_nmap, *nmap);
+	return error;
+}
+
+/*
+ * Map file blocks to filesystem blocks, simple version.
+ * One block (extent) only, read-only.
+ * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
+ * For the other flag values, the effect is as if XFS_BMAPI_METADATA
+ * was set and all the others were clear.
+ */
+int						/* error */
+xfs_bmapi_single(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*ip,		/* incore inode */
+	int		whichfork,	/* data or attr fork */
+	xfs_fsblock_t	*fsb,		/* output: mapped block */
+	xfs_fileoff_t	bno)		/* starting file offs. mapped */
+{
+	int		eof;		/* we've hit the end of extent list */
+	int		error;		/* error return */
+	xfs_bmbt_irec_t got;		/* current extent list record */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_extnum_t	lastx;		/* last useful extent number */
+	xfs_bmbt_irec_t prev;		/* previous extent list record */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) {
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+	       return XFS_ERROR(EFSCORRUPTED);
+	}
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return XFS_ERROR(EIO);
+	XFS_STATS_INC(xfsstats.xs_blk_mapr);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	(void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	/*
+	 * Reading past eof, act as though there's a hole
+	 * up to end.
+	 */
+	if (eof || got.br_startoff > bno) {
+		*fsb = NULLFSBLOCK;
+		return 0;
+	}
+	ASSERT(!ISNULLSTARTBLOCK(got.br_startblock));
+	ASSERT(bno < got.br_startoff + got.br_blockcount);
+	*fsb = got.br_startblock + (bno - got.br_startoff);
+	ifp->if_lastex = lastx;
+	return 0;
+}
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value.	If not all extents in the block range can be removed then
+ * *done is set.
+ */
+int						/* error */
+xfs_bunmapi(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting offset to unmap */
+	xfs_filblks_t		len,		/* length to unmap in file */
+	int			flags,		/* misc flags */
+	xfs_extnum_t		nexts,		/* number of extents max */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	int			*done)		/* set if not done yet */
+{
+	int			async;		/* xactions can be async */
+	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
+	xfs_bmbt_irec_t		del;		/* extent being deleted */
+	int			eof;		/* is deleting at eof */
+	xfs_bmbt_rec_t		*ep;		/* extent list entry pointer */
+	int			error;		/* error return value */
+	xfs_extnum_t		extno;		/* extent number in list */
+	xfs_bmbt_irec_t		got;		/* current extent list entry */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	int			isrt;		/* freeing in rt area */
+	xfs_extnum_t		lastx;		/* last extent index used */
+	int			logflags;	/* transaction logging flags */
+	xfs_extlen_t		mod;		/* rt extent offset */
+	xfs_mount_t		*mp;		/* mount structure */
+	xfs_extnum_t		nextents;	/* size of extent list */
+	xfs_bmbt_irec_t		prev;		/* previous extent list entry */
+	xfs_fileoff_t		start;		/* first file offset deleted */
+	int			tmp_logflags;	/* partial logging flags */
+	int			wasdel;		/* was a delayed alloc extent */
+	int			whichfork;	/* data or attribute fork */
+	int			rsvd;		/* OK to allocate reserved blocks */
+	xfs_fsblock_t		sum;
+
+	xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address);
+	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+		XFS_ATTR_FORK : XFS_DATA_FORK;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	mp = ip->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+	async = flags & XFS_BMAPI_ASYNC;
+	rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
+	ASSERT(len > 0);
+	ASSERT(nexts >= 0);
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(tp, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*done = 1;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_blk_unmap);
+	isrt = (whichfork == XFS_DATA_FORK) &&
+	       (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
+	start = bno;
+	bno = start + len - 1;
+	ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got,
+		&prev);
+	/*
+	 * Check to see if the given block number is past the end of the
+	 * file, back up to the last block if so...
+	 */
+	if (eof) {
+		ep = &ifp->if_u1.if_extents[--lastx];
+		xfs_bmbt_get_all(ep, &got);
+		bno = got.br_startoff + got.br_blockcount - 1;
+	}
+	logflags = 0;
+	if (ifp->if_flags & XFS_IFBROOT) {
+		ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
+		cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
+			whichfork);
+		cur->bc_private.b.firstblock = *firstblock;
+		cur->bc_private.b.flist = flist;
+		cur->bc_private.b.flags = 0;
+	} else
+		cur = NULL;
+	extno = 0;
+	while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
+	       (nexts == 0 || extno < nexts)) {
+		/*
+		 * Is the found extent after a hole in which bno lives?
+		 * Just back up to the previous extent, if so.
+		 */
+		if (got.br_startoff > bno) {
+			if (--lastx < 0)
+				break;
+			ep--;
+			xfs_bmbt_get_all(ep, &got);
+		}
+		/*
+		 * Is the last block of this extent before the range
+		 * we're supposed to delete?  If so, we're done.
+		 */
+		bno = XFS_FILEOFF_MIN(bno,
+			got.br_startoff + got.br_blockcount - 1);
+		if (bno < start)
+			break;
+		/*
+		 * Then deal with the (possibly delayed) allocated space
+		 * we found.
+		 */
+		ASSERT(ep != NULL);
+		del = got;
+		wasdel = ISNULLSTARTBLOCK(del.br_startblock);
+		if (got.br_startoff < start) {
+			del.br_startoff = start;
+			del.br_blockcount -= start - got.br_startoff;
+			if (!wasdel)
+				del.br_startblock += start - got.br_startoff;
+		}
+		if (del.br_startoff + del.br_blockcount > bno + 1)
+			del.br_blockcount = bno + 1 - del.br_startoff;
+		sum = del.br_startblock + del.br_blockcount;
+		if (isrt &&
+		    (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
+			/*
+			 * Realtime extent not lined up at the end.
+			 * The extent could have been split into written
+			 * and unwritten pieces, or we could just be
+			 * unmapping part of it.  But we can't really
+			 * get rid of part of a realtime extent.
+			 */
+			if (del.br_state == XFS_EXT_UNWRITTEN ||
+			    !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+				/*
+				 * This piece is unwritten, or we're not
+				 * using unwritten extents.  Skip over it.
+				 */
+				ASSERT(bno >= mod);
+				bno -= mod > del.br_blockcount ?
+					del.br_blockcount : mod;
+				if (bno < got.br_startoff) {
+					if (--lastx >= 0)
+						xfs_bmbt_get_all(--ep, &got);
+				}
+				continue;
+			}
+			/*
+			 * It's written, turn it unwritten.
+			 * This is better than zeroing it.
+			 */
+			ASSERT(del.br_state == XFS_EXT_NORM);
+			ASSERT(xfs_trans_get_block_res(tp) > 0);
+			/*
+			 * If this spans a realtime extent boundary,
+			 * chop it back to the start of the one we end at.
+			 */
+			if (del.br_blockcount > mod) {
+				del.br_startoff += del.br_blockcount - mod;
+				del.br_startblock += del.br_blockcount - mod;
+				del.br_blockcount = mod;
+			}
+			del.br_state = XFS_EXT_UNWRITTEN;
+			error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
+				firstblock, flist, &logflags, XFS_DATA_FORK, 0);
+			if (error)
+				goto error0;
+			goto nodelete;
+		}
+		if (isrt && (mod = do_mod(del.br_startblock, mp->m_sb.sb_rextsize))) {
+			/*
+			 * Realtime extent is lined up at the end but not
+			 * at the front.  We'll get rid of full extents if
+			 * we can.
+			 */
+			mod = mp->m_sb.sb_rextsize - mod;
+			if (del.br_blockcount > mod) {
+				del.br_blockcount -= mod;
+				del.br_startoff += mod;
+				del.br_startblock += mod;
+			} else if ((del.br_startoff == start &&
+				    (del.br_state == XFS_EXT_UNWRITTEN ||
+				     xfs_trans_get_block_res(tp) == 0)) ||
+				   !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
+				/*
+				 * Can't make it unwritten.  There isn't
+				 * a full extent here so just skip it.
+				 */
+				ASSERT(bno >= del.br_blockcount);
+				bno -= del.br_blockcount;
+				if (bno < got.br_startoff) {
+					if (--lastx >= 0)
+						xfs_bmbt_get_all(--ep, &got);
+				}
+				continue;
+			} else if (del.br_state == XFS_EXT_UNWRITTEN) {
+				/*
+				 * This one is already unwritten.
+				 * It must have a written left neighbor.
+				 * Unwrite the killed part of that one and
+				 * try again.
+				 */
+				ASSERT(lastx > 0);
+				xfs_bmbt_get_all(ep - 1, &prev);
+				ASSERT(prev.br_state == XFS_EXT_NORM);
+				ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock));
+				ASSERT(del.br_startblock ==
+				       prev.br_startblock + prev.br_blockcount);
+				if (prev.br_startoff < start) {
+					mod = start - prev.br_startoff;
+					prev.br_blockcount -= mod;
+					prev.br_startblock += mod;
+					prev.br_startoff = start;
+				}
+				prev.br_state = XFS_EXT_UNWRITTEN;
+				error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
+					&prev, firstblock, flist, &logflags,
+					XFS_DATA_FORK, 0);
+				if (error)
+					goto error0;
+				goto nodelete;
+			} else {
+				ASSERT(del.br_state == XFS_EXT_NORM);
+				del.br_state = XFS_EXT_UNWRITTEN;
+				error = xfs_bmap_add_extent(ip, lastx, &cur,
+					&del, firstblock, flist, &logflags,
+					XFS_DATA_FORK, 0);
+				if (error)
+					goto error0;
+				goto nodelete;
+			}
+		}
+		if (wasdel) {
+			ASSERT(STARTBLOCKVAL(del.br_startblock) > 0);
+			xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
+				(int)del.br_blockcount, rsvd);
+			if (XFS_IS_QUOTA_ON(ip->i_mount)) {
+				ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
+				ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+				if (!isrt)
+					xfs_trans_unreserve_blkquota(NULL, ip,
+					      (long)del.br_blockcount);
+				else
+					xfs_trans_unreserve_rtblkquota(NULL, ip,
+					      (long)del.br_blockcount);
+			}
+			ip->i_delayed_blks -= del.br_blockcount;
+			if (cur)
+				cur->bc_private.b.flags |=
+					XFS_BTCUR_BPRV_WASDEL;
+		} else if (cur)
+			cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
+		/*
+		 * If it's the case where the directory code is running
+		 * with no block reservation, and the deleted block is in
+		 * the middle of its extent, and the resulting insert
+		 * of an extent would cause transformation to btree format,
+		 * then reject it.  The calling code will then swap
+		 * blocks around instead.
+		 * We have to do this now, rather than waiting for the
+		 * conversion to btree format, since the transaction
+		 * will be dirty.
+		 */
+		if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
+		    XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+		    XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+		    del.br_startoff > got.br_startoff &&
+		    del.br_startoff + del.br_blockcount <
+		    got.br_startoff + got.br_blockcount) {
+			error = XFS_ERROR(ENOSPC);
+			goto error0;
+		}
+		error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
+			flags, &tmp_logflags, whichfork, rsvd);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+		bno = del.br_startoff - 1;
+nodelete:
+		lastx = ifp->if_lastex;
+		/*
+		 * If not done go on to the next (previous) record.
+		 * Reset ep in case the extents array was re-alloced.
+		 */
+		ep = &ifp->if_u1.if_extents[lastx];
+		if (bno != (xfs_fileoff_t)-1 && bno >= start) {
+			if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) ||
+			    xfs_bmbt_get_startoff(ep) > bno) {
+				lastx--;
+				ep--;
+			}
+			if (lastx >= 0)
+				xfs_bmbt_get_all(ep, &got);
+			extno++;
+		}
+	}
+	ifp->if_lastex = lastx;
+	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	/*
+	 * Convert to a btree if necessary.
+	 */
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+	    XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
+			&cur, 0, &tmp_logflags, whichfork);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	/*
+	 * transform from btree to extents, give it cur
+	 */
+	else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+		 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+		ASSERT(cur != NULL);
+		error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+			whichfork, async);
+		logflags |= tmp_logflags;
+		if (error)
+			goto error0;
+	}
+	/*
+	 * transform from extents to local?
+	 */
+	ASSERT(ifp->if_ext_max ==
+	       XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+	error = 0;
+error0:
+	/*
+	 * Log everything.  Do this after conversion, there's no point in
+	 * logging the extent list if we've converted to btree format.
+	 */
+	if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+		logflags &= ~XFS_ILOG_FEXT(whichfork);
+	else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		logflags &= ~XFS_ILOG_FBROOT(whichfork);
+	/*
+	 * Log inode even in the error case, if the transaction
+	 * is dirty we'll need to shut down the filesystem.
+	 */
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (cur) {
+		if (!error) {
+			*firstblock = cur->bc_private.b.firstblock;
+			cur->bc_private.b.allocated = 0;
+		}
+		xfs_btree_del_cursor(cur,
+			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
+/*
+ * Fcntl interface to xfs_bmapi.
+ */
+int						/* error code */
+xfs_getbmap(
+	bhv_desc_t		*bdp,		/* XFS behavior descriptor*/
+	struct getbmap		*bmv,		/* user bmap structure */
+	void			*ap,		/* pointer to user's array */
+	int			interface)	/* interface flags */
+{
+	__int64_t		bmvend;		/* last block requested */
+	int			error;		/* return value */
+	__int64_t		fixlen;		/* length for -1 case */
+	int			i;		/* extent number */
+	xfs_inode_t		*ip;		/* xfs incore inode pointer */
+	vnode_t			*vp;		/* corresponding vnode */
+	int			lock;		/* lock state */
+	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			nex;		/* # of user extents can do */
+	int			nexleft;	/* # of user extents left */
+	int			subnex;		/* # of bmapi's can do */
+	int			nmap;		/* number of map entries */
+	struct getbmap		out;		/* output structure */
+	int			whichfork;	/* data or attr fork */
+	int			prealloced;	/* this is a file with
+						 * preallocated data space */
+	int			sh_unwritten;	/* true, if unwritten */
+						/* extents listed seperately */
+	int			bmapi_flags;	/* flags for xfs_bmapi */
+	__int32_t		oflags;		/* getbmapx bmv_oflags field */
+
+	ip = XFS_BHVTOI(bdp);
+	vp = BHV_TO_VNODE(bdp);
+
+	whichfork = interface & BMV_IF_ATTRFORK ?
+				XFS_ATTR_FORK : XFS_DATA_FORK;
+	sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
+
+
+	/*	If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
+	 *	generate a DMAPI read event.  Otherwise, if the DM_EVENT_READ
+	 *	bit is set for the file, generate a read event in order
+	 *	that the DMAPI application may do its thing before we return
+	 *	the extents.  Usually this means restoring user file data to
+	 *	regions of the file that look like holes.
+	 *
+	 *	The "old behavior" (from XFS_IOC_GETBMAP) is to not specify
+	 *	BMV_IF_NO_DMAPI_READ so that read events are generated.
+	 *	If this were not true, callers of ioctl( XFS_IOC_GETBMAP )
+	 *	could misinterpret holes in a DMAPI file as true holes,
+	 *	when in fact they may represent offline user data.
+	 */
+	if (   (interface & BMV_IF_NO_DMAPI_READ) == 0
+	    && DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)
+	    && whichfork == XFS_DATA_FORK) {
+
+		error = xfs_dm_send_data_event(DM_EVENT_READ, bdp,
+				0, 0, 0, NULL);
+		if (error)
+			return XFS_ERROR(error);
+	}
+
+	if (whichfork == XFS_ATTR_FORK) {
+		if (XFS_IFORK_Q(ip)) {
+			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
+			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
+			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
+				return XFS_ERROR(EINVAL);
+		} else if (ip->i_d.di_aformat != 0 &&
+			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
+			cmn_err(CE_NOTE,
+				"EFSCORRUPTED returned from file %s line %d",
+				__FILE__, __LINE__);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	} else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
+		   ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
+		   ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+		return XFS_ERROR(EINVAL);
+
+	mp = ip->i_mount;
+
+	if (whichfork == XFS_DATA_FORK) {
+		if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) {
+			prealloced = 1;
+			fixlen = XFS_MAX_FILE_OFFSET;
+		} else {
+			prealloced = 0;
+			fixlen = ip->i_d.di_size;
+		}
+	} else {
+		prealloced = 0;
+		fixlen = 1LL << 32;
+	}
+
+	if (bmv->bmv_length == -1) {
+		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
+		bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset),
+					(__int64_t)0);
+	} else if (bmv->bmv_length < 0)
+		return XFS_ERROR(EINVAL);
+	if (bmv->bmv_length == 0) {
+		bmv->bmv_entries = 0;
+		return 0;
+	}
+
+	nex = bmv->bmv_count - 1;
+
+	if (nex <= 0)
+		return XFS_ERROR(EINVAL);
+
+	bmvend = bmv->bmv_offset + bmv->bmv_length;
+
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+	if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) {
+
+		VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
+	}
+
+	ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
+
+	lock = xfs_ilock_map_shared(ip);
+
+	/*
+	 * Don't let nex be bigger than the number of extents
+	 * we can have assuming alternating holes and real extents.
+	 */
+	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
+		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
+
+	bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
+			((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE);
+
+	subnex = 16;	/* XXXjtk - need a #define?	*/
+
+	/*
+	 * Allocate enough space to handle "subnex" maps at a time.
+	 */
+	map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP);
+
+	bmv->bmv_entries = 0;
+
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) {
+		error = 0;
+		goto unlock_and_return;
+	}
+
+	nexleft = nex;
+
+	do {
+	    if (nexleft > subnex)
+		    nmap = subnex;
+	    else
+		    nmap = nexleft;
+
+	    error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
+					XFS_BB_TO_FSB(mp, bmv->bmv_length),
+					bmapi_flags, NULL, 0,
+					map, &nmap, NULL);
+	    ASSERT(nmap <= subnex);
+
+	    if (error)
+		    goto unlock_and_return;
+
+	    for (error = i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+		nexleft--;
+
+		oflags = 0;
+
+		out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
+		out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
+
+		ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+
+		if (   prealloced
+		    && map[i].br_startblock == HOLESTARTBLOCK
+		    && out.bmv_offset + out.bmv_length == bmvend) {
+			/*
+			 * came to hole at end of file
+			 */
+			goto unlock_and_return;
+		} else {
+			if (map[i].br_startblock == HOLESTARTBLOCK)
+				out.bmv_block = -1;
+			else
+				out.bmv_block =
+					XFS_FSB_TO_DB(ip, map[i].br_startblock);
+
+			/* return either a getbmap or a getbmapx structure. */
+
+			if (interface & BMV_IF_EXTENDED) {
+				struct	getbmapx	outx;
+
+				GETBMAP_CONVERT(out,outx);
+
+				outx.bmv_oflags = oflags;
+				outx.bmv_unused1 = outx.bmv_unused2 = 0;
+
+				if (copy_to_user(ap, &outx, sizeof(outx))) {
+					error = XFS_ERROR(EFAULT);
+					goto unlock_and_return;
+				}
+			} else {
+				if (copy_to_user(ap, &out, sizeof(out))) {
+					error = XFS_ERROR(EFAULT);
+					goto unlock_and_return;
+				}
+			}
+
+			bmv->bmv_offset = out.bmv_offset + out.bmv_length;
+			bmv->bmv_length = MAX( (__int64_t)0,
+					(__int64_t)(bmvend - bmv->bmv_offset) );
+
+			bmv->bmv_entries++;
+
+			if (interface & BMV_IF_EXTENDED)
+				ap = (void *)((struct getbmapx *)ap + 1);
+			else
+				ap = (void *)((struct getbmap *)ap + 1);
+		}
+	    }
+	} while (nmap && nexleft && bmv->bmv_length);
+
+unlock_and_return:
+	xfs_iunlock_map_shared(ip, lock);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+	kmem_free(map, subnex * sizeof(*map));
+
+	return error;
+}
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ */
+int					/* error */
+xfs_bmap_isaeof(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fileoff_t	off,		/* file offset in fsblocks */
+	int		whichfork,	/* data or attribute fork */
+	char		*aeof)		/* return value */
+{
+	int		error;		/* error return value */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_t	*lastrec;	/* extent list entry pointer */
+	xfs_extnum_t	nextents;	/* size of extent list */
+	xfs_bmbt_irec_t s;		/* expanded extent list entry */
+
+	ASSERT(whichfork == XFS_DATA_FORK);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(NULL, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*aeof = 1;
+		return 0;
+	}
+	/*
+	 * Go to the last extent
+	 */
+	lastrec = &ifp->if_u1.if_extents[nextents - 1];
+	xfs_bmbt_get_all(lastrec, &s);
+	/*
+	 * Check we are allocating in the last extent (for delayed allocations)
+	 * or past the last extent for non-delayed allocations.
+	 */
+	*aeof = (off >= s.br_startoff &&
+		 off < s.br_startoff + s.br_blockcount &&
+		 ISNULLSTARTBLOCK(s.br_startblock)) ||
+		off >= s.br_startoff + s.br_blockcount;
+	return 0;
+}
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary.
+ */
+int					/* error */
+xfs_bmap_eof(
+	xfs_inode_t	*ip,		/* incore inode pointer */
+	xfs_fileoff_t	endoff,		/* file offset in fsblocks */
+	int		whichfork,	/* data or attribute fork */
+	int		*eof)		/* result value */
+{
+	xfs_fsblock_t	blockcount;	/* extent block count */
+	int		error;		/* error return value */
+	xfs_ifork_t	*ifp;		/* inode fork pointer */
+	xfs_bmbt_rec_t	*lastrec;	/* extent list entry pointer */
+	xfs_extnum_t	nextents;	/* size of extent list */
+	xfs_fileoff_t	startoff;	/* extent starting file offset */
+
+	ASSERT(whichfork == XFS_DATA_FORK);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (!(ifp->if_flags & XFS_IFEXTENTS) &&
+	    (error = xfs_iread_extents(NULL, ip, whichfork)))
+		return error;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	if (nextents == 0) {
+		*eof = 1;
+		return 0;
+	}
+	/*
+	 * Go to the last extent
+	 */
+	lastrec = &ifp->if_u1.if_extents[nextents - 1];
+	startoff = xfs_bmbt_get_startoff(lastrec);
+	blockcount = xfs_bmbt_get_blockcount(lastrec);
+	*eof = endoff >= startoff + blockcount;
+	return 0;
+}
+
+#ifdef XFSDEBUG
+/*
+ * Check that the extents list for the inode ip is in the right order.
+ */
+STATIC void
+xfs_bmap_check_extents(
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_rec_t		*base;		/* base of extents list */
+	xfs_bmbt_rec_t		*ep;		/* current extent entry */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_extnum_t		nextents;	/* number of extents in list */
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+	base = ifp->if_u1.if_extents;
+	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	for (ep = base; ep < &base[nextents - 1]; ep++) {
+		xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
+			(void *)(ep + 1));
+	}
+}
+
+STATIC
+xfs_buf_t *
+xfs_bmap_get_bp(
+	xfs_btree_cur_t		*cur,
+	xfs_fsblock_t		bno)
+{
+	int i;
+	xfs_buf_t *bp;
+
+	if (!cur)
+		return(NULL);
+
+	bp = NULL;
+	for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
+		bp = cur->bc_bufs[i];
+		if (!bp) break;
+		if (XFS_BUF_ADDR(bp) == bno)
+			break;	/* Found it */
+	}
+	if (i == XFS_BTREE_MAXLEVELS)
+		bp = NULL;
+
+	if (!bp) { /* Chase down all the log items to see if the bp is there */
+		xfs_log_item_chunk_t	*licp;
+		xfs_trans_t		*tp;
+
+		tp = cur->bc_tp;
+		licp = &tp->t_items;
+		while (!bp && licp != NULL) {
+			if (XFS_LIC_ARE_ALL_FREE(licp)) {
+				licp = licp->lic_next;
+				continue;
+			}
+			for (i = 0; i < licp->lic_unused; i++) {
+				xfs_log_item_desc_t	*lidp;
+				xfs_log_item_t		*lip;
+				xfs_buf_log_item_t	*bip;
+				xfs_buf_t		*lbp;
+
+				if (XFS_LIC_ISFREE(licp, i)) {
+					continue;
+				}
+
+				lidp = XFS_LIC_SLOT(licp, i);
+				lip = lidp->lid_item;
+				if (lip->li_type != XFS_LI_BUF)
+					continue;
+
+				bip = (xfs_buf_log_item_t *)lip;
+				lbp = bip->bli_buf;
+
+				if (XFS_BUF_ADDR(lbp) == bno) {
+					bp = lbp;
+					break; /* Found it */
+				}
+			}
+			licp = licp->lic_next;
+		}
+	}
+	return(bp);
+}
+
+void
+xfs_check_block(
+	xfs_bmbt_block_t	*block,
+	xfs_mount_t		*mp,
+	int			root,
+	short			sz)
+{
+	int			i, j, dmxr;
+	xfs_bmbt_ptr_t		*pp, *thispa;	/* pointer to block address */
+	xfs_bmbt_key_t		*prevp, *keyp;
+
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+
+	prevp = NULL;
+	for( i = 1; i <= INT_GET(block->bb_numrecs, ARCH_CONVERT);i++) {
+		dmxr = mp->m_bmap_dmxr[0];
+
+		if (root) {
+			keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
+		} else {
+			keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize,
+				xfs_bmbt, block, i, dmxr);
+		}
+
+		if (prevp) {
+			xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp);
+		}
+		prevp = keyp;
+
+		/*
+		 * Compare the block numbers to see if there are dups.
+		 */
+
+		if (root) {
+			pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
+		} else {
+			pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
+				xfs_bmbt, block, i, dmxr);
+		}
+		for (j = i+1; j <= INT_GET(block->bb_numrecs, ARCH_CONVERT); j++) {
+			if (root) {
+				thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz);
+			} else {
+				thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
+					xfs_bmbt, block, j, dmxr);
+			}
+			if (INT_GET(*thispa, ARCH_CONVERT) == INT_GET(*pp, ARCH_CONVERT)) {
+				printk("xfs_check_block: thispa(%d) == pp(%d) %Ld\n",
+						j, i, INT_GET(*thispa, ARCH_CONVERT));
+				panic("xfs_check_block: ptrs are equal in node\n");
+			}
+		}
+	}
+}
+
+/*
+ * Check that the extents for the inode ip are in the right order in all
+ * btree leaves.
+ */
+
+STATIC void
+xfs_bmap_check_leaf_extents(
+	xfs_btree_cur_t		*cur,	/* btree cursor or null */
+	xfs_inode_t		*ip,		/* incore inode pointer */
+	int			whichfork)	/* data or attr fork */
+{
+	xfs_bmbt_block_t	*block; /* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_buf_t		*bp;	/* buffer for "block" */
+	int			error;	/* error return value */
+	xfs_extnum_t		i=0;	/* index into the extents list */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+	xfs_bmbt_rec_t		*ep, *lastp;	/* extent pointers in block entry */
+	int			bp_release = 0;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) {
+		return;
+	}
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	block = ifp->if_broot;
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+	level = INT_GET(block->bb_level, ARCH_CONVERT);
+	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
+	bno = INT_GET(*pp, ARCH_CONVERT);
+	/*
+	 * Go down the tree until leaf level is reached, following the first
+	 * pointer (leftmost) at each level.
+	 */
+	while (level-- > 0) {
+		/* See if buf is in cur first */
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (bp) {
+			bp_release = 0;
+		} else {
+			bp_release = 1;
+		}
+		if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			goto error_norelse;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		XFS_WANT_CORRUPTED_GOTO(
+			XFS_BMAP_SANITY_CHECK(mp, block, level),
+			error0);
+		if (level == 0)
+			break;
+
+		/*
+		 * Check this block for basic sanity (increasing keys and
+		 * no duplicate blocks).
+		 */
+
+		xfs_check_block(block, mp, 0, 0);
+		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
+			1, mp->m_bmap_dmxr[1]);
+		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0);
+		bno = INT_GET(*pp, ARCH_CONVERT);
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+	}
+
+	/*
+	 * Here with bp and block set to the leftmost leaf node in the tree.
+	 */
+	i = 0;
+
+	/*
+	 * Loop over all leaf nodes checking that all extents are in the right order.
+	 */
+	lastp = NULL;
+	for (;;) {
+		xfs_bmbt_rec_t	*frp;
+		xfs_fsblock_t	nextbno;
+		xfs_extnum_t	num_recs;
+
+
+		num_recs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+
+		/*
+		 * Read-ahead the next leaf block, if any.
+		 */
+
+		nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+
+		/*
+		 * Check all the extents to make sure they are OK.
+		 * If we had a previous block, the last entry should
+		 * conform with the first entry in this one.
+		 */
+
+		frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
+			block, 1, mp->m_bmap_dmxr[0]);
+
+		for (ep = frp;ep < frp + (num_recs - 1); ep++) {
+			if (lastp) {
+				xfs_btree_check_rec(XFS_BTNUM_BMAP,
+					(void *)lastp, (void *)ep);
+			}
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
+				(void *)(ep + 1));
+		}
+		lastp = frp + num_recs - 1; /* For the next iteration */
+
+		i += num_recs;
+		if (bp_release) {
+			bp_release = 0;
+			xfs_trans_brelse(NULL, bp);
+		}
+		bno = nextbno;
+		/*
+		 * If we've reached the end, stop.
+		 */
+		if (bno == NULLFSBLOCK)
+			break;
+
+		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
+		if (bp) {
+			bp_release = 0;
+		} else {
+			bp_release = 1;
+		}
+		if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+			goto error_norelse;
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+	}
+	if (bp_release) {
+		bp_release = 0;
+		xfs_trans_brelse(NULL, bp);
+	}
+	return;
+
+error0:
+	printk("at error0\n");
+	if (bp_release)
+		xfs_trans_brelse(NULL, bp);
+error_norelse:
+	printk("xfs_bmap_check_leaf_extents: BAD after btree leaves for %d extents\n", i);
+	panic("xfs_bmap_check_leaf_extents: CORRUPTED BTREE OR SOMETHING");
+	return;
+}
+#endif
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int						/* error */
+xfs_bmap_count_blocks(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*ip,		/* incore inode */
+	int			whichfork,	/* data or attr fork */
+	int			*count)		/* out: count of blocks */
+{
+	xfs_bmbt_block_t	*block; /* current btree block */
+	xfs_fsblock_t		bno;	/* block # of "block" */
+	xfs_ifork_t		*ifp;	/* fork structure */
+	int			level;	/* btree level, for checking */
+	xfs_mount_t		*mp;	/* file system mount structure */
+	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+
+	bno = NULLFSBLOCK;
+	mp = ip->i_mount;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
+		if (xfs_bmap_count_leaves(ifp->if_u1.if_extents,
+			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
+			count) < 0) {
+				cmn_err(CE_NOTE,
+					"EFSCORRUPTED returned from file %s line %d",
+					__FILE__, __LINE__);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		return 0;
+	}
+
+	/*
+	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
+	 */
+	block = ifp->if_broot;
+	ASSERT(INT_GET(block->bb_level, ARCH_CONVERT) > 0);
+	level = INT_GET(block->bb_level, ARCH_CONVERT);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
+	bno = INT_GET(*pp, ARCH_CONVERT);
+
+	if (xfs_bmap_count_tree(mp, tp, bno, level, count) < 0) {
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	return 0;
+}
+
+/*
+ * Recursively walks each level of a btree
+ * to count total fsblocks is use.
+ */
+int					/* error */
+xfs_bmap_count_tree(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_fsblock_t	blockno,	/* file system block number */
+	int		levelin,	/* level in btree */
+	int		*count)		/* Count of blocks */
+{
+	int			error;
+	xfs_buf_t		*bp, *nbp;
+	int			level = levelin;
+	xfs_bmbt_ptr_t		*pp;
+	xfs_fsblock_t		bno = blockno;
+	xfs_fsblock_t		nextbno;
+	xfs_bmbt_block_t	*block, *nextblock;
+	int			numrecs;
+	xfs_bmbt_rec_t		*frp;
+
+	if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
+		return error;
+	*count += 1;
+	block = XFS_BUF_TO_BMBT_BLOCK(bp);
+
+	if (--level) {
+		/* Not at node above leafs, count this level of nodes */
+		nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+		while (nextbno != NULLFSBLOCK) {
+			if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
+				0, &nbp, XFS_BMAP_BTREE_REF)))
+				return error;
+			*count += 1;
+			nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp);
+			nextbno = INT_GET(nextblock->bb_rightsib, ARCH_CONVERT);
+			xfs_trans_brelse(tp, nbp);
+		}
+
+		/* Dive to the next level */
+		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
+			xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+		bno = INT_GET(*pp, ARCH_CONVERT);
+		if ((error =
+		     xfs_bmap_count_tree(mp, tp, bno, level, count)) < 0) {
+			xfs_trans_brelse(tp, bp);
+			cmn_err(CE_NOTE,
+				"EFSCORRUPTED returned from file %s line %d",
+				__FILE__, __LINE__);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		xfs_trans_brelse(tp, bp);
+	} else {
+		/* count all level 1 nodes and their leaves */
+		for (;;) {
+			nextbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+			numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+			frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize,
+				xfs_bmbt, block, 1, mp->m_bmap_dmxr[0]);
+			if (xfs_bmap_count_leaves(frp, numrecs, count) < 0) {
+				xfs_trans_brelse(tp, bp);
+				cmn_err(CE_NOTE,
+					"EFSCORRUPTED returned from file %s line %d",
+					__FILE__, __LINE__);
+				return XFS_ERROR(EFSCORRUPTED);
+			}
+			xfs_trans_brelse(tp, bp);
+			if (nextbno == NULLFSBLOCK)
+				break;
+			bno = nextbno;
+			if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF)))
+				return error;
+			*count += 1;
+			block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Count leaf blocks given a pointer to an extent list.
+ */
+int
+xfs_bmap_count_leaves(
+	xfs_bmbt_rec_t		*frp,
+	int			numrecs,
+	int			*count)
+{
+	int		b;
+
+	for ( b = 1; b <= numrecs; b++, frp++)
+		*count += xfs_bmbt_get_blockcount(frp);
+	return 0;
+}
+
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
new file mode 100644
index 000000000000..cb0dc44fb9df
--- /dev/null
+++ b/fs/xfs/xfs_bmap.h
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BMAP_H__
+#define __XFS_BMAP_H__
+
+struct getbmap;
+struct xfs_bmbt_irec;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * List of extents to be free "later".
+ * The list is kept sorted on xbf_startblock.
+ */
+typedef struct xfs_bmap_free_item
+{
+	xfs_fsblock_t		xbfi_startblock;/* starting fs block number */
+	xfs_extlen_t		xbfi_blockcount;/* number of blocks in extent */
+	struct xfs_bmap_free_item *xbfi_next;	/* link to next entry */
+} xfs_bmap_free_item_t;
+
+/*
+ * Header for free extent list.
+ */
+typedef struct xfs_bmap_free
+{
+	xfs_bmap_free_item_t	*xbf_first;	/* list of to-be-free extents */
+	int			xbf_count;	/* count of items on list */
+	int			xbf_low;	/* kludge: alloc in low mode */
+} xfs_bmap_free_t;
+
+#define XFS_BMAP_MAX_NMAP	4
+
+/*
+ * Flags for xfs_bmapi
+ */
+#define XFS_BMAPI_WRITE		0x001	/* write operation: allocate space */
+#define XFS_BMAPI_DELAY		0x002	/* delayed write operation */
+#define XFS_BMAPI_ENTIRE	0x004	/* return entire extent, not trimmed */
+#define XFS_BMAPI_METADATA	0x008	/* mapping metadata not user data */
+#define XFS_BMAPI_EXACT		0x010	/* allocate only to spec'd bounds */
+#define XFS_BMAPI_ATTRFORK	0x020	/* use attribute fork not data */
+#define XFS_BMAPI_ASYNC		0x040	/* bunmapi xactions can be async */
+#define XFS_BMAPI_RSVBLOCKS	0x080	/* OK to alloc. reserved data blocks */
+#define XFS_BMAPI_PREALLOC	0x100	/* preallocation op: unwritten space */
+#define XFS_BMAPI_IGSTATE	0x200	/* Ignore state - */
+					/* combine contig. space */
+#define XFS_BMAPI_CONTIG	0x400	/* must allocate only one extent */
+#define XFS_BMAPI_DIRECT_IO	0x800	/* Flag from cxfs client, not used
+					 * by xfs directly. Indicates alloc
+					 * request is for direct I/O not
+					 * extent conversion by server */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAPI_AFLAG)
+int xfs_bmapi_aflag(int w);
+#define XFS_BMAPI_AFLAG(w)	xfs_bmapi_aflag(w)
+#else
+#define XFS_BMAPI_AFLAG(w)	((w) == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0)
+#endif
+
+/*
+ * Special values for xfs_bmbt_irec_t br_startblock field.
+ */
+#define DELAYSTARTBLOCK		((xfs_fsblock_t)-1LL)
+#define HOLESTARTBLOCK		((xfs_fsblock_t)-2LL)
+
+/*
+ * Trace operations for bmap extent tracing
+ */
+#define XFS_BMAP_KTRACE_DELETE	1
+#define XFS_BMAP_KTRACE_INSERT	2
+#define XFS_BMAP_KTRACE_PRE_UP	3
+#define XFS_BMAP_KTRACE_POST_UP 4
+
+#define XFS_BMAP_TRACE_SIZE	4096	/* size of global trace buffer */
+#define XFS_BMAP_KTRACE_SIZE	32	/* size of per-inode trace buffer */
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_BMAP_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef	XFS_BMAP_TRACE
+#endif
+
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_INIT)
+void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp);
+#define XFS_BMAP_INIT(flp,fbp)	xfs_bmap_init(flp,fbp)
+#else
+#define XFS_BMAP_INIT(flp,fbp)	\
+	((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
+	 (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK)
+#endif
+
+/*
+ * Argument structure for xfs_bmap_alloc.
+ */
+typedef struct xfs_bmalloca {
+	xfs_fsblock_t		firstblock; /* i/o first block allocated */
+	xfs_fsblock_t		rval;	/* starting block of new extent */
+	xfs_fileoff_t		off;	/* offset in file filling in */
+	struct xfs_trans	*tp;	/* transaction pointer */
+	struct xfs_inode	*ip;	/* incore inode pointer */
+	struct xfs_bmbt_irec	*prevp; /* extent before the new one */
+	struct xfs_bmbt_irec	*gotp;	/* extent after, or delayed */
+	xfs_extlen_t		alen;	/* i/o length asked/allocated */
+	xfs_extlen_t		total;	/* total blocks needed for xaction */
+	xfs_extlen_t		minlen; /* mininum allocation size (blocks) */
+	xfs_extlen_t		minleft; /* amount must be left after alloc */
+	char			eof;	/* set if allocating past last extent */
+	char			wasdel; /* replacing a delayed allocation */
+	char			userdata;/* set if is user data */
+	char			low;	/* low on space, using seq'l ags */
+	char			aeof;	/* allocated space at eof */
+} xfs_bmalloca_t;
+
+#ifdef __KERNEL__
+/*
+ * Convert inode from non-attributed to attributed.
+ * Must not be in a transaction, ip must not be locked.
+ */
+int					/* error code */
+xfs_bmap_add_attrfork(
+	struct xfs_inode	*ip,	/* incore inode pointer */
+	int					rsvd);	/* flag for reserved block allocation */
+
+/*
+ * Add the extent to the list of extents to be free at transaction end.
+ * The list is maintained sorted (by block number).
+ */
+void
+xfs_bmap_add_free(
+	xfs_fsblock_t		bno,		/* fs block number of extent */
+	xfs_filblks_t		len,		/* length of extent */
+	xfs_bmap_free_t		*flist,		/* list of extents */
+	struct xfs_mount	*mp);		/* mount point structure */
+
+/*
+ * Routine to clean up the free list data structure when
+ * an error occurs during a transaction.
+ */
+void
+xfs_bmap_cancel(
+	xfs_bmap_free_t		*flist);	/* free list to clean up */
+
+/*
+ * Routine to check if a specified inode is swap capable.
+ */
+int
+xfs_bmap_check_swappable(
+	struct xfs_inode	*ip);		/* incore inode */
+
+/*
+ * Compute and fill in the value of the maximum depth of a bmap btree
+ * in this filesystem.	Done once, during mount.
+ */
+void
+xfs_bmap_compute_maxlevels(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.
+ *
+ * Return 1 if the given transaction was committed and a new one allocated,
+ * and 0 otherwise.
+ */
+int						/* error */
+xfs_bmap_finish(
+	struct xfs_trans	**tp,		/* transaction pointer addr */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	xfs_fsblock_t		firstblock,	/* controlled a.g. for allocs */
+	int			*committed);	/* xact committed or not */
+
+/*
+ * Returns the file-relative block number of the first unused block in the file.
+ * This is the lowest-address hole if the file has holes, else the first block
+ * past the end of file.
+ */
+int						/* error */
+xfs_bmap_first_unused(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_extlen_t		len,		/* size of hole to find */
+	xfs_fileoff_t		*unused,	/* unused block num */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Returns the file-relative block number of the last block + 1 before
+ * last_block (input value) in the file.
+ * This is not based on i_size, it is based on the extent list.
+ * Returns 0 for local files, as they do not have an extent list.
+ */
+int						/* error */
+xfs_bmap_last_before(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		*last_block,	/* last block */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Returns the file-relative block number of the first block past eof in
+ * the file.  This is not based on i_size, it is based on the extent list.
+ * Returns 0 for local files, as they do not have an extent list.
+ */
+int						/* error */
+xfs_bmap_last_offset(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		*unused,	/* last block num */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Returns whether the selected fork of the inode has exactly one
+ * block or not.  For the data fork we check this matches di_size,
+ * implying the file's range is 0..bsize-1.
+ */
+int
+xfs_bmap_one_block(
+	struct xfs_inode	*ip,		/* incore inode */
+	int			whichfork);	/* data or attr fork */
+
+/*
+ * Read in the extents to iu_extents.
+ * All inode fields are set up by caller, we just traverse the btree
+ * and copy the records in.
+ */
+int						/* error */
+xfs_bmap_read_extents(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	int			whichfork);	/* data or attr fork */
+
+#if defined(XFS_BMAP_TRACE)
+/*
+ * Add bmap trace insert entries for all the contents of the extent list.
+ */
+void
+xfs_bmap_trace_exlist(
+	char			*fname,		/* function name */
+	struct xfs_inode	*ip,		/* incore inode pointer */
+	xfs_extnum_t		cnt,		/* count of entries in list */
+	int			whichfork);	/* data or attr fork */
+#else
+#define xfs_bmap_trace_exlist(f,ip,c,w)
+#endif
+
+/*
+ * Map file blocks to filesystem blocks.
+ * File range is given by the bno/len pair.
+ * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set)
+ * into a hole or past eof.
+ * Only allocates blocks from a single allocation group,
+ * to avoid locking problems.
+ * The returned value in "firstblock" from the first call in a transaction
+ * must be remembered and presented to subsequent calls in "firstblock".
+ * An upper bound for the number of blocks to be allocated is supplied to
+ * the first call in "total"; if no allocation group has that many free
+ * blocks then the call will fail (return NULLFSBLOCK in "firstblock").
+ */
+int						/* error */
+xfs_bmapi(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting file offs. mapped */
+	xfs_filblks_t		len,		/* length to map in file */
+	int			flags,		/* XFS_BMAPI_... */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_extlen_t		total,		/* total blocks needed */
+	struct xfs_bmbt_irec	*mval,		/* output: map values */
+	int			*nmap,		/* i/o: mval size/count */
+	xfs_bmap_free_t		*flist);	/* i/o: list extents to free */
+
+/*
+ * Map file blocks to filesystem blocks, simple version.
+ * One block only, read-only.
+ * For flags, only the XFS_BMAPI_ATTRFORK flag is examined.
+ * For the other flag values, the effect is as if XFS_BMAPI_METADATA
+ * was set and all the others were clear.
+ */
+int						/* error */
+xfs_bmapi_single(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	int			whichfork,	/* data or attr fork */
+	xfs_fsblock_t		*fsb,		/* output: mapped block */
+	xfs_fileoff_t		bno);		/* starting file offs. mapped */
+
+/*
+ * Unmap (remove) blocks from a file.
+ * If nexts is nonzero then the number of extents to remove is limited to
+ * that value.	If not all extents in the block range can be removed then
+ * *done is set.
+ */
+int						/* error */
+xfs_bunmapi(
+	struct xfs_trans	*tp,		/* transaction pointer */
+	struct xfs_inode	*ip,		/* incore inode */
+	xfs_fileoff_t		bno,		/* starting offset to unmap */
+	xfs_filblks_t		len,		/* length to unmap in file */
+	int			flags,		/* XFS_BMAPI_... */
+	xfs_extnum_t		nexts,		/* number of extents max */
+	xfs_fsblock_t		*firstblock,	/* first allocated block
+						   controls a.g. for allocs */
+	xfs_bmap_free_t		*flist,		/* i/o: list extents to free */
+	int			*done);		/* set if not done yet */
+
+/*
+ * Fcntl interface to xfs_bmapi.
+ */
+int						/* error code */
+xfs_getbmap(
+	bhv_desc_t		*bdp,		/* XFS behavior descriptor*/
+	struct getbmap		*bmv,		/* user bmap structure */
+	void			*ap,		/* pointer to user's array */
+	int			iflags);	/* interface flags */
+
+/*
+ * Check the last inode extent to determine whether this allocation will result
+ * in blocks being allocated at the end of the file. When we allocate new data
+ * blocks at the end of the file which do not start at the previous data block,
+ * we will try to align the new blocks at stripe unit boundaries.
+ */
+int
+xfs_bmap_isaeof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		off,
+	int			whichfork,
+	char			*aeof);
+
+/*
+ * Check if the endoff is outside the last extent. If so the caller will grow
+ * the allocation to a stripe unit boundary
+ */
+int
+xfs_bmap_eof(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		endoff,
+	int			whichfork,
+	int			*eof);
+
+/*
+ * Count fsblocks of the given fork.
+ */
+int
+xfs_bmap_count_blocks(
+	xfs_trans_t		*tp,
+	xfs_inode_t		*ip,
+	int			whichfork,
+	int			*count);
+
+/*
+ * Check an extent list, which has just been read, for
+ * any bit in the extent flag field.
+ */
+int
+xfs_check_nostate_extents(
+	xfs_bmbt_rec_t		*ep,
+	xfs_extnum_t		num);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
new file mode 100644
index 000000000000..5b384f4f9cba
--- /dev/null
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -0,0 +1,2634 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+#ifdef DEBUG
+ktrace_t	*xfs_bmbt_trace_buf;
+#endif
+
+/*
+ * Prototypes for internal btree functions.
+ */
+
+
+STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *, int);
+STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
+		xfs_bmbt_key_t *, xfs_btree_cur_t **, int *);
+STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
+
+
+#if defined(XFS_BMBT_TRACE)
+/*
+ * Add a trace buffer entry for the arguments given to the routine,
+ * generic form.
+ */
+STATIC void
+xfs_bmbt_trace_enter(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	char		*s,
+	int		type,
+	int		line,
+	__psunsigned_t	a0,
+	__psunsigned_t	a1,
+	__psunsigned_t	a2,
+	__psunsigned_t	a3,
+	__psunsigned_t	a4,
+	__psunsigned_t	a5,
+	__psunsigned_t	a6,
+	__psunsigned_t	a7,
+	__psunsigned_t	a8,
+	__psunsigned_t	a9,
+	__psunsigned_t	a10)
+{
+	xfs_inode_t	*ip;
+	int		whichfork;
+
+	ip = cur->bc_private.b.ip;
+	whichfork = cur->bc_private.b.whichfork;
+	ktrace_enter(xfs_bmbt_trace_buf,
+		(void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+		(void *)func, (void *)s, (void *)ip, (void *)cur,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
+		(void *)a8, (void *)a9, (void *)a10);
+	ASSERT(ip->i_btrace);
+	ktrace_enter(ip->i_btrace,
+		(void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+		(void *)func, (void *)s, (void *)ip, (void *)cur,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6, (void *)a7,
+		(void *)a8, (void *)a9, (void *)a10);
+}
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
+ */
+STATIC void
+xfs_bmbt_trace_argbi(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	xfs_buf_t	*b,
+	int		i,
+	int		line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
+		(__psunsigned_t)b, i, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
+ */
+STATIC void
+xfs_bmbt_trace_argbii(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	xfs_buf_t	*b,
+	int		i0,
+	int		i1,
+	int		line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
+		(__psunsigned_t)b, i0, i1, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for 3 block-length args
+ * and an integer arg.
+ */
+STATIC void
+xfs_bmbt_trace_argfffi(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	xfs_dfiloff_t		o,
+	xfs_dfsbno_t		b,
+	xfs_dfilblks_t		i,
+	int			j,
+	int			line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
+		o >> 32, (int)o, b >> 32, (int)b,
+		i >> 32, (int)i, (int)j, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for one integer arg.
+ */
+STATIC void
+xfs_bmbt_trace_argi(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	int		i,
+	int		line)
+{
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
+		i, 0, 0, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, key.
+ */
+STATIC void
+xfs_bmbt_trace_argifk(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	int			i,
+	xfs_fsblock_t		f,
+	xfs_bmbt_key_t		*k,
+	int			line)
+{
+	xfs_dfsbno_t		d;
+	xfs_dfiloff_t		o;
+
+	d = (xfs_dfsbno_t)f;
+	o = INT_GET(k->br_startoff, ARCH_CONVERT);
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
+		i, d >> 32, (int)d, o >> 32,
+		(int)o, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, rec.
+ */
+STATIC void
+xfs_bmbt_trace_argifr(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	int			i,
+	xfs_fsblock_t		f,
+	xfs_bmbt_rec_t		*r,
+	int			line)
+{
+	xfs_dfsbno_t		b;
+	xfs_dfilblks_t		c;
+	xfs_dfsbno_t		d;
+	xfs_dfiloff_t		o;
+	xfs_bmbt_irec_t		s;
+
+	d = (xfs_dfsbno_t)f;
+	xfs_bmbt_get_all(r, &s);
+	o = (xfs_dfiloff_t)s.br_startoff;
+	b = (xfs_dfsbno_t)s.br_startblock;
+	c = s.br_blockcount;
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
+		i, d >> 32, (int)d, o >> 32,
+		(int)o, b >> 32, (int)b, c >> 32,
+		(int)c, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, key.
+ */
+STATIC void
+xfs_bmbt_trace_argik(
+	char			*func,
+	xfs_btree_cur_t		*cur,
+	int			i,
+	xfs_bmbt_key_t		*k,
+	int			line)
+{
+	xfs_dfiloff_t		o;
+
+	o = INT_GET(k->br_startoff, ARCH_CONVERT);
+	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
+		i, o >> 32, (int)o, 0,
+		0, 0, 0, 0,
+		0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for the cursor/operation.
+ */
+STATIC void
+xfs_bmbt_trace_cursor(
+	char		*func,
+	xfs_btree_cur_t *cur,
+	char		*s,
+	int		line)
+{
+	xfs_bmbt_rec_t	r;
+
+	xfs_bmbt_set_all(&r, &cur->bc_rec.b);
+	xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
+		(cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
+		cur->bc_private.b.allocated,
+#if BMBT_USE_64
+		INT_GET(r.l0, ARCH_CONVERT) >> 32, (int)INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT) >> 32, (int)INT_GET(r.l1, ARCH_CONVERT),
+#else
+		INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT), INT_GET(r.l2, ARCH_CONVERT), INT_GET(r.l3, ARCH_CONVERT),
+#endif
+		(unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
+		(unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
+		(cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
+		(cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
+}
+
+#define XFS_BMBT_TRACE_ARGBI(c,b,i)	\
+	xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__)
+#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)	\
+	xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__)
+#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)	\
+	xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__)
+#define XFS_BMBT_TRACE_ARGI(c,i)	\
+	xfs_bmbt_trace_argi(fname, c, i, __LINE__)
+#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k)	\
+	xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__)
+#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)	\
+	xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__)
+#define XFS_BMBT_TRACE_ARGIK(c,i,k)	\
+	xfs_bmbt_trace_argik(fname, c, i, k, __LINE__)
+#define XFS_BMBT_TRACE_CURSOR(c,s)	\
+	xfs_bmbt_trace_cursor(fname, c, s, __LINE__)
+static char	ARGS[] = "args";
+static char	ENTRY[] = "entry";
+static char	ERROR[] = "error";
+#undef EXIT
+static char	EXIT[] = "exit";
+#else
+#define XFS_BMBT_TRACE_ARGBI(c,b,i)
+#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
+#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
+#define XFS_BMBT_TRACE_ARGI(c,i)
+#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k)
+#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
+#define XFS_BMBT_TRACE_ARGIK(c,i,k)
+#define XFS_BMBT_TRACE_CURSOR(c,s)
+#endif	/* XFS_BMBT_TRACE */
+
+
+/*
+ * Internal functions.
+ */
+
+/*
+ * Delete record pointed to by cur/level.
+ */
+STATIC int					/* error */
+xfs_bmbt_delrec(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			async,		/* deletion can be async */
+	int			*stat)		/* success/failure */
+{
+	xfs_bmbt_block_t	*block;		/* bmap btree block */
+	xfs_fsblock_t		bno;		/* fs-relative block number */
+	xfs_buf_t		*bp;		/* buffer for block */
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_delrec";
+#endif
+	int			i;		/* loop counter */
+	int			j;		/* temp state */
+	xfs_bmbt_key_t		key;		/* bmap btree key */
+	xfs_bmbt_key_t		*kp=NULL;	/* pointer to bmap btree key */
+	xfs_fsblock_t		lbno;		/* left sibling block number */
+	xfs_buf_t		*lbp;		/* left buffer pointer */
+	xfs_bmbt_block_t	*left;		/* left btree block */
+	xfs_bmbt_key_t		*lkp;		/* left btree key */
+	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
+	int			lrecs=0;	/* left record count */
+	xfs_bmbt_rec_t		*lrp;		/* left record pointer */
+	xfs_mount_t		*mp;		/* file system mount point */
+	xfs_bmbt_ptr_t		*pp;		/* pointer to bmap block addr */
+	int			ptr;		/* key/record index */
+	xfs_fsblock_t		rbno;		/* right sibling block number */
+	xfs_buf_t		*rbp;		/* right buffer pointer */
+	xfs_bmbt_block_t	*right;		/* right btree block */
+	xfs_bmbt_key_t		*rkp;		/* right btree key */
+	xfs_bmbt_rec_t		*rp;		/* pointer to bmap btree rec */
+	xfs_bmbt_ptr_t		*rpp;		/* right address pointer */
+	xfs_bmbt_block_t	*rrblock;	/* right-right btree block */
+	xfs_buf_t		*rrbp;		/* right-right buffer pointer */
+	int			rrecs=0;	/* right record count */
+	xfs_bmbt_rec_t		*rrp;		/* right record pointer */
+	xfs_btree_cur_t		*tcur;		/* temporary btree cursor */
+	int			numrecs;	/* temporary numrec count */
+	int			numlrecs, numrrecs;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	ptr = cur->bc_ptrs[level];
+	tcur = (xfs_btree_cur_t *)0;
+	if (ptr == 0) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	block = xfs_bmbt_get_block(cur, level, &bp);
+	numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		goto error0;
+	}
+#endif
+	if (ptr > numrecs) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_bmbt_delrec);
+	if (level > 0) {
+		kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+		pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = ptr; i < numrecs; i++) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+		}
+#endif
+		if (ptr < numrecs) {
+			ovbcopy(&kp[ptr], &kp[ptr - 1],
+				(numrecs - ptr) * sizeof(*kp));
+			ovbcopy(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */
+				(numrecs - ptr) * sizeof(*pp));
+			xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
+			xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
+		}
+	} else {
+		rp = XFS_BMAP_REC_IADDR(block, 1, cur);
+		if (ptr < numrecs) {
+			ovbcopy(&rp[ptr], &rp[ptr - 1],
+				(numrecs - ptr) * sizeof(*rp));
+			xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
+		}
+		if (ptr == 1) {
+			INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(rp));
+			kp = &key;
+		}
+	}
+	numrecs--;
+	INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
+	xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
+	/*
+	 * We're at the root level.
+	 * First, shrink the root block in-memory.
+	 * Try to get rid of the next level down.
+	 * If we can't then there's nothing left to do.
+	 */
+	if (level == cur->bc_nlevels - 1) {
+		xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+			cur->bc_private.b.whichfork);
+		if ((error = xfs_bmbt_killroot(cur, async))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		goto error0;
+	}
+	if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
+		if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+	lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
+	/*
+	 * One child of root, need to get a chance to copy its contents
+	 * into the root and delete it. Can't go up to next level,
+	 * there's nothing to delete there.
+	 */
+	if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
+	    level == cur->bc_nlevels - 2) {
+		if ((error = xfs_bmbt_killroot(cur, async))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
+	if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		goto error0;
+	}
+	bno = NULLFSBLOCK;
+	if (rbno != NULLFSBLOCK) {
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if ((error = xfs_bmbt_increment(tcur, level, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		rbp = tcur->bc_bufs[level];
+		right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+#endif
+		bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
+		    XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
+			if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_BMAP_BLOCK_IMINRECS(level, tcur));
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				tcur = NULL;
+				if (level > 0) {
+					if ((error = xfs_bmbt_decrement(cur,
+							level, &i))) {
+						XFS_BMBT_TRACE_CURSOR(cur,
+							ERROR);
+						goto error0;
+					}
+				}
+				XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+				*stat = 1;
+				return 0;
+			}
+		}
+		rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
+		if (lbno != NULLFSBLOCK) {
+			i = xfs_btree_firstrec(tcur, level);
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		}
+	}
+	if (lbno != NULLFSBLOCK) {
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * decrement to last in block
+		 */
+		if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		i = xfs_btree_firstrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		lbp = tcur->bc_bufs[level];
+		left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+#endif
+		bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
+		    XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
+			if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_BMAP_BLOCK_IMINRECS(level, tcur));
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+				tcur = NULL;
+				if (level == 0)
+					cur->bc_ptrs[0]++;
+				XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+				*stat = 1;
+				return 0;
+			}
+		}
+		lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	tcur = NULL;
+	mp = cur->bc_mp;
+	ASSERT(bno != NULLFSBLOCK);
+	if (lbno != NULLFSBLOCK &&
+	    lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		rbno = bno;
+		right = block;
+		rbp = bp;
+		if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+		if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+	} else if (rbno != NULLFSBLOCK &&
+		   rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+		   XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		lbno = bno;
+		left = block;
+		lbp = bp;
+		if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+		if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	} else {
+		if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	numlrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	numrrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
+	if (level > 0) {
+		lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
+		lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
+		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
+		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < numrrecs; i++) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				goto error0;
+			}
+		}
+#endif
+		bcopy(rkp, lkp, numrrecs * sizeof(*lkp));
+		bcopy(rpp, lpp, numrrecs * sizeof(*lpp));
+		xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
+		xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
+	} else {
+		lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
+		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
+		bcopy(rrp, lrp, numrrecs * sizeof(*lrp));
+		xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
+	}
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, numrrecs);
+	left->bb_rightsib = right->bb_rightsib; /* INT_: direct copy */
+	xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
+		if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
+				INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
+		if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			goto error0;
+		}
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
+		xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+	}
+	xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
+		cur->bc_private.b.flist, mp);
+	if (!async)
+		xfs_trans_set_sync(cur->bc_tp);
+	cur->bc_private.b.ip->i_d.di_nblocks--;
+	xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+	if (XFS_IS_QUOTA_ON(mp) &&
+	    cur->bc_private.b.ip->i_ino != mp->m_sb.sb_uquotino &&
+	    cur->bc_private.b.ip->i_ino != mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(cur->bc_tp, cur->bc_private.b.ip,
+			XFS_TRANS_DQ_BCOUNT, -1L);
+	xfs_trans_binval(cur->bc_tp, rbp);
+	if (bp != lbp) {
+		cur->bc_bufs[level] = lbp;
+		cur->bc_ptrs[level] += lrecs;
+		cur->bc_ra[level] = 0;
+	} else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		goto error0;
+	}
+	if (level > 0)
+		cur->bc_ptrs[level]--;
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 2;
+	return 0;
+
+error0:
+	if (tcur)
+		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+#ifdef XFSDEBUG
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_bmbt_get_rec(
+	xfs_btree_cur_t		*cur,
+	xfs_fileoff_t		*off,
+	xfs_fsblock_t		*bno,
+	xfs_filblks_t		*len,
+	xfs_exntst_t		*state,
+	int			*stat)
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+#ifdef DEBUG
+	int			error;
+#endif
+	int			ptr;
+	xfs_bmbt_rec_t		*rp;
+
+	block = xfs_bmbt_get_block(cur, 0, &bp);
+	ptr = cur->bc_ptrs[0];
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, 0, bp)))
+		return error;
+#endif
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
+		*stat = 0;
+		return 0;
+	}
+	rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
+	*off = xfs_bmbt_get_startoff(rp);
+	*bno = xfs_bmbt_get_startblock(rp);
+	*len = xfs_bmbt_get_blockcount(rp);
+	*state = xfs_bmbt_get_state(rp);
+	*stat = 1;
+	return 0;
+}
+#endif
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int					/* error */
+xfs_bmbt_insrec(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	xfs_fsblock_t		*bnop,
+	xfs_bmbt_rec_t		*recp,
+	xfs_btree_cur_t		**curp,
+	int			*stat)		/* no-go/done/continue */
+{
+	xfs_bmbt_block_t	*block;		/* bmap btree block */
+	xfs_buf_t		*bp;		/* buffer for block */
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_insrec";
+#endif
+	int			i;		/* loop index */
+	xfs_bmbt_key_t		key;		/* bmap btree key */
+	xfs_bmbt_key_t		*kp=NULL;	/* pointer to bmap btree key */
+	int			logflags;	/* inode logging flags */
+	xfs_fsblock_t		nbno;		/* new block number */
+	struct xfs_btree_cur	*ncur;		/* new btree cursor */
+	xfs_bmbt_key_t		nkey;		/* new btree key value */
+	xfs_bmbt_rec_t		nrec;		/* new record count */
+	int			optr;		/* old key/record index */
+	xfs_bmbt_ptr_t		*pp;		/* pointer to bmap block addr */
+	int			ptr;		/* key/record index */
+	xfs_bmbt_rec_t		*rp=NULL;	/* pointer to bmap btree rec */
+	int			numrecs;
+
+	ASSERT(level < cur->bc_nlevels);
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
+	ncur = (xfs_btree_cur_t *)0;
+	INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(recp));
+	optr = ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	XFS_STATS_INC(xfsstats.xs_bmbt_insrec);
+	block = xfs_bmbt_get_block(cur, level, &bp);
+	numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (ptr <= numrecs) {
+		if (level == 0) {
+			rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
+		} else {
+			kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
+			xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
+		}
+	}
+#endif
+	nbno = NULLFSBLOCK;
+	if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
+			/*
+			 * A root block, that can be made bigger.
+			 */
+			xfs_iroot_realloc(cur->bc_private.b.ip, 1,
+				cur->bc_private.b.whichfork);
+			block = xfs_bmbt_get_block(cur, level, &bp);
+		} else if (level == cur->bc_nlevels - 1) {
+			if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
+			    *stat == 0) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+			xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+				logflags);
+			block = xfs_bmbt_get_block(cur, level, &bp);
+		} else {
+			if ((error = xfs_bmbt_rshift(cur, level, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+			if (i) {
+				/* nothing */
+			} else {
+				if ((error = xfs_bmbt_lshift(cur, level, &i))) {
+					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+					return error;
+				}
+				if (i) {
+					optr = ptr = cur->bc_ptrs[level];
+				} else {
+					if ((error = xfs_bmbt_split(cur, level,
+							&nbno, &nkey, &ncur,
+							&i))) {
+						XFS_BMBT_TRACE_CURSOR(cur,
+							ERROR);
+						return error;
+					}
+					if (i) {
+						block = xfs_bmbt_get_block(
+							    cur, level, &bp);
+#ifdef DEBUG
+						if ((error =
+						    xfs_btree_check_lblock(cur,
+							    block, level, bp))) {
+							XFS_BMBT_TRACE_CURSOR(
+								cur, ERROR);
+							return error;
+						}
+#endif
+						ptr = cur->bc_ptrs[level];
+						xfs_bmbt_set_allf(&nrec,
+							nkey.br_startoff, 0, 0,
+							XFS_EXT_NORM);
+					} else {
+						XFS_BMBT_TRACE_CURSOR(cur,
+							EXIT);
+						*stat = 0;
+						return 0;
+					}
+				}
+			}
+		}
+	}
+	numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+	if (level > 0) {
+		kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+		pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = numrecs; i >= ptr; i--) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT),
+					level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+		}
+#endif
+		ovbcopy(&kp[ptr - 1], &kp[ptr],
+			(numrecs - ptr + 1) * sizeof(*kp));
+		ovbcopy(&pp[ptr - 1], &pp[ptr], /* INT_: direct copy */
+			(numrecs - ptr + 1) * sizeof(*pp));
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop,
+				level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		kp[ptr - 1] = key;
+		INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
+		numrecs++;
+		INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
+		xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
+		xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
+	} else {
+		rp = XFS_BMAP_REC_IADDR(block, 1, cur);
+		ovbcopy(&rp[ptr - 1], &rp[ptr],
+			(numrecs - ptr + 1) * sizeof(*rp));
+		rp[ptr - 1] = *recp;
+		numrecs++;
+		INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs);
+		xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
+	}
+	xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
+#ifdef DEBUG
+	if (ptr < numrecs) {
+		if (level == 0)
+			xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
+				rp + ptr);
+		else
+			xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
+				kp + ptr);
+	}
+#endif
+	if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	*bnop = nbno;
+	if (nbno != NULLFSBLOCK) {
+		*recp = nrec;
+		*curp = ncur;
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+STATIC int
+xfs_bmbt_killroot(
+	xfs_btree_cur_t		*cur,
+	int			async)
+{
+	xfs_bmbt_block_t	*block;
+	xfs_bmbt_block_t	*cblock;
+	xfs_buf_t		*cbp;
+	xfs_bmbt_key_t		*ckp;
+	xfs_bmbt_ptr_t		*cpp;
+#ifdef DEBUG
+	int			error;
+#endif
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_killroot";
+#endif
+	int			i;
+	xfs_bmbt_key_t		*kp;
+	xfs_inode_t		*ip;
+	xfs_ifork_t		*ifp;
+	int			level;
+	xfs_bmbt_ptr_t		*pp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	level = cur->bc_nlevels - 1;
+	ASSERT(level >= 1);
+	/*
+	 * Don't deal with the root block needs to be a leaf case.
+	 * We're just going to turn the thing back into extents anyway.
+	 */
+	if (level == 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		return 0;
+	}
+	block = xfs_bmbt_get_block(cur, level, &cbp);
+	/*
+	 * Give up if the root has multiple children.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) != 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		return 0;
+	}
+	/*
+	 * Only do this if the next level will fit.
+	 * Then the data must be copied up to the inode,
+	 * instead of freeing the root you free the next level.
+	 */
+	cbp = cur->bc_bufs[level - 1];
+	cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
+	if (INT_GET(cblock->bb_numrecs, ARCH_CONVERT) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		return 0;
+	}
+	ASSERT(INT_GET(cblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO);
+	ASSERT(INT_GET(cblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO);
+	ip = cur->bc_private.b.ip;
+	ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
+	ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
+	       XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
+	i = (int)(INT_GET(cblock->bb_numrecs, ARCH_CONVERT) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
+	if (i) {
+		xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
+		block = ifp->if_broot;
+	}
+	INT_MOD(block->bb_numrecs, ARCH_CONVERT, i);
+	ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) == INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
+	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+	ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
+	bcopy(ckp, kp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*kp));
+	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+	cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
+#ifdef DEBUG
+	for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) {
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+	}
+#endif
+	bcopy(cpp, pp, INT_GET(block->bb_numrecs, ARCH_CONVERT) * sizeof(*pp));
+	xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
+		cur->bc_private.b.flist, cur->bc_mp);
+	if (!async)
+		xfs_trans_set_sync(cur->bc_tp);
+	ip->i_d.di_nblocks--;
+	if (XFS_IS_QUOTA_ON(cur->bc_mp) &&
+	    ip->i_ino != cur->bc_mp->m_sb.sb_uquotino &&
+	    ip->i_ino != cur->bc_mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(cur->bc_tp, ip, XFS_TRANS_DQ_BCOUNT,
+			-1L);
+	xfs_trans_binval(cur->bc_tp, cbp);
+	cur->bc_bufs[level - 1] = NULL;
+	INT_MOD(block->bb_level, ARCH_CONVERT, -1);
+	xfs_trans_log_inode(cur->bc_tp, ip,
+		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+	cur->bc_nlevels--;
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	return 0;
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_bmbt_log_keys(
+	xfs_btree_cur_t *cur,
+	xfs_buf_t	*bp,
+	int		kfirst,
+	int		klast)
+{
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_log_keys";
+#endif
+	xfs_trans_t	*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
+	tp = cur->bc_tp;
+	if (bp) {
+		xfs_bmbt_block_t	*block;
+		int			first;
+		xfs_bmbt_key_t		*kp;
+		int			last;
+
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
+		first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
+		last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
+		xfs_trans_log_buf(tp, bp, first, last);
+	} else {
+		xfs_inode_t		 *ip;
+
+		ip = cur->bc_private.b.ip;
+		xfs_trans_log_inode(tp, ip,
+			XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+}
+
+/*
+ * Log pointer values from the btree block.
+ */
+STATIC void
+xfs_bmbt_log_ptrs(
+	xfs_btree_cur_t *cur,
+	xfs_buf_t	*bp,
+	int		pfirst,
+	int		plast)
+{
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_log_ptrs";
+#endif
+	xfs_trans_t	*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
+	tp = cur->bc_tp;
+	if (bp) {
+		xfs_bmbt_block_t	*block;
+		int			first;
+		int			last;
+		xfs_bmbt_ptr_t		*pp;
+
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
+		first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
+		last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
+		xfs_trans_log_buf(tp, bp, first, last);
+	} else {
+		xfs_inode_t		*ip;
+
+		ip = cur->bc_private.b.ip;
+		xfs_trans_log_inode(tp, ip,
+			XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+}
+
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ */
+STATIC int				/* error */
+xfs_bmbt_lookup(
+	xfs_btree_cur_t		*cur,
+	xfs_lookup_t		dir,
+	int			*stat)		/* success/failure */
+{
+	xfs_bmbt_block_t	*block=NULL;
+	xfs_buf_t		*bp;
+	xfs_daddr_t		d;
+	xfs_sfiloff_t		diff;
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_lookup";
+#endif
+	xfs_fsblock_t		fsbno=0;
+	int			high;
+	int			i;
+	int			keyno=0;
+	xfs_bmbt_key_t		*kkbase=NULL;
+	xfs_bmbt_key_t		*kkp;
+	xfs_bmbt_rec_t		*krbase=NULL;
+	xfs_bmbt_rec_t		*krp;
+	int			level;
+	int			low;
+	xfs_mount_t		*mp;
+	xfs_bmbt_ptr_t		*pp;
+	xfs_bmbt_irec_t		*rp;
+	xfs_fileoff_t		startoff;
+	xfs_trans_t		*tp;
+
+	XFS_STATS_INC(xfsstats.xs_bmbt_lookup);
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, (int)dir);
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+	rp = &cur->bc_rec.b;
+	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+		if (level < cur->bc_nlevels - 1) {
+			d = XFS_FSB_TO_DADDR(mp, fsbno);
+			bp = cur->bc_bufs[level];
+			if (bp && XFS_BUF_ADDR(bp) != d)
+				bp = (xfs_buf_t *)0;
+			if (!bp) {
+				if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
+						0, &bp, XFS_BMAP_BTREE_REF))) {
+					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+					return error;
+				}
+				xfs_btree_setbuf(cur, level, bp);
+				block = XFS_BUF_TO_BMBT_BLOCK(bp);
+				if ((error = xfs_btree_check_lblock(cur, block,
+						level, bp))) {
+					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+					return error;
+				}
+			} else
+				block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		} else
+			block = xfs_bmbt_get_block(cur, level, &bp);
+		if (diff == 0)
+			keyno = 1;
+		else {
+			if (level > 0)
+				kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
+			else
+				krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
+			low = 1;
+			if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
+				ASSERT(level == 0);
+				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+				XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+				*stat = 0;
+				return 0;
+			}
+			while (low <= high) {
+				XFS_STATS_INC(xfsstats.xs_bmbt_compare);
+				keyno = (low + high) >> 1;
+				if (level > 0) {
+					kkp = kkbase + keyno - 1;
+					startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT);
+				} else {
+					krp = krbase + keyno - 1;
+					startoff = xfs_bmbt_get_startoff(krp);
+				}
+				diff = (xfs_sfiloff_t)
+						(startoff - rp->br_startoff);
+				if (diff < 0)
+					low = keyno + 1;
+				else if (diff > 0)
+					high = keyno - 1;
+				else
+					break;
+			}
+		}
+		if (level > 0) {
+			if (diff > 0 && --keyno < 1)
+				keyno = 1;
+			pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
+#ifdef DEBUG
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+#endif
+			fsbno = INT_GET(*pp, ARCH_CONVERT);
+			cur->bc_ptrs[level] = keyno;
+		}
+	}
+	if (dir != XFS_LOOKUP_LE && diff < 0) {
+		keyno++;
+		/*
+		 * If ge search and we went off the end of the block, but it's
+		 * not the last block, we're in the wrong block.
+		 */
+		if (dir == XFS_LOOKUP_GE && keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
+		    INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
+			cur->bc_ptrs[0] = keyno;
+			if ((error = xfs_bmbt_increment(cur, 0, &i))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+			XFS_WANT_CORRUPTED_RETURN(i == 1);
+			XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+			*stat = 1;
+			return 0;
+		}
+	}
+	else if (dir == XFS_LOOKUP_LE && diff > 0)
+		keyno--;
+	cur->bc_ptrs[0] = keyno;
+	if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+	} else {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
+	}
+	return 0;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int					/* error */
+xfs_bmbt_lshift(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_lshift";
+#endif
+#ifdef DEBUG
+	int			i;		/* loop counter */
+#endif
+	xfs_bmbt_key_t		key;		/* bmap btree key */
+	xfs_buf_t		*lbp;		/* left buffer pointer */
+	xfs_bmbt_block_t	*left;		/* left btree block */
+	xfs_bmbt_key_t		*lkp=NULL;	/* left btree key */
+	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
+	int			lrecs;		/* left record count */
+	xfs_bmbt_rec_t		*lrp=NULL;	/* left record pointer */
+	xfs_mount_t		*mp;		/* file system mount point */
+	xfs_buf_t		*rbp;		/* right buffer pointer */
+	xfs_bmbt_block_t	*right;		/* right btree block */
+	xfs_bmbt_key_t		*rkp=NULL;	/* right btree key */
+	xfs_bmbt_ptr_t		*rpp=NULL;	/* right address pointer */
+	xfs_bmbt_rec_t		*rrp=NULL;	/* right record pointer */
+	int			rrecs;		/* right record count */
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	if (level == cur->bc_nlevels - 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	rbp = cur->bc_bufs[level];
+	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	if (cur->bc_ptrs[level] <= 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	mp = cur->bc_mp;
+	if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0,
+			&lbp, XFS_BMAP_BTREE_REF))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+	if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
+	if (level > 0) {
+		lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
+		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
+		*lkp = *rkp;
+		xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
+		lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
+		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		*lpp = *rpp; /* INT_: direct copy */
+		xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
+	} else {
+		lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
+		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
+		*lrp = *rrp;
+		xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
+	}
+	INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs);
+	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
+#ifdef DEBUG
+	if (level > 0)
+		xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
+	else
+		xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
+#endif
+	rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1;
+	INT_SET(right->bb_numrecs, ARCH_CONVERT, rrecs);
+	xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
+	if (level > 0) {
+#ifdef DEBUG
+		for (i = 0; i < rrecs; i++) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
+					level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+		}
+#endif
+		ovbcopy(rkp + 1, rkp, rrecs * sizeof(*rkp));
+		ovbcopy(rpp + 1, rpp, rrecs * sizeof(*rpp));
+		xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
+		xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
+	} else {
+		ovbcopy(rrp + 1, rrp, rrecs * sizeof(*rrp));
+		xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
+		INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(rrp));
+		rkp = &key;
+	}
+	if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	cur->bc_ptrs[level]--;
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int					/* error */
+xfs_bmbt_rshift(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_rshift";
+#endif
+	int			i;		/* loop counter */
+	xfs_bmbt_key_t		key;		/* bmap btree key */
+	xfs_buf_t		*lbp;		/* left buffer pointer */
+	xfs_bmbt_block_t	*left;		/* left btree block */
+	xfs_bmbt_key_t		*lkp;		/* left btree key */
+	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
+	xfs_bmbt_rec_t		*lrp;		/* left record pointer */
+	xfs_mount_t		*mp;		/* file system mount point */
+	xfs_buf_t		*rbp;		/* right buffer pointer */
+	xfs_bmbt_block_t	*right;		/* right btree block */
+	xfs_bmbt_key_t		*rkp;		/* right btree key */
+	xfs_bmbt_ptr_t		*rpp;		/* right address pointer */
+	xfs_bmbt_rec_t		*rrp=NULL;	/* right record pointer */
+	struct xfs_btree_cur	*tcur;		/* temporary btree cursor */
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	if (level == cur->bc_nlevels - 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	lbp = cur->bc_bufs[level];
+	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	mp = cur->bc_mp;
+	if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
+			&rbp, XFS_BMAP_BTREE_REF))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+	if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	if (level > 0) {
+		lkp = XFS_BMAP_KEY_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		lpp = XFS_BMAP_PTR_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
+		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+		}
+#endif
+		ovbcopy(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		*rkp = *lkp;
+		*rpp = *lpp; /* INT_: direct copy */
+		xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+	} else {
+		lrp = XFS_BMAP_REC_IADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
+		ovbcopy(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		*rrp = *lrp;
+		xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_get_startoff(rrp));
+		rkp = &key;
+	}
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+#ifdef DEBUG
+	if (level > 0)
+		xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
+	else
+		xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
+#endif
+	xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
+	if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	i = xfs_btree_lastrec(tcur, level);
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	if ((error = xfs_bmbt_increment(tcur, level, &i))) {
+		XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
+		goto error1;
+	}
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
+		XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
+		goto error1;
+	}
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+error0:
+	XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+error1:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Determine the extent state.
+ */
+/* ARGSUSED */
+STATIC xfs_exntst_t
+xfs_extent_state(
+	xfs_filblks_t		blks,
+	int			extent_flag)
+{
+	if (extent_flag) {
+		ASSERT(blks != 0);	/* saved for DMIG */
+		return XFS_EXT_UNWRITTEN;
+	}
+	return XFS_EXT_NORM;
+}
+
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and its first record (to be inserted into parent).
+ */
+STATIC int					/* error */
+xfs_bmbt_split(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	xfs_fsblock_t		*bnop,
+	xfs_bmbt_key_t		*keyp,
+	xfs_btree_cur_t		**curp,
+	int			*stat)		/* success/failure */
+{
+	xfs_alloc_arg_t		args;		/* block allocation args */
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_split";
+#endif
+	int			i;		/* loop counter */
+	xfs_fsblock_t		lbno;		/* left sibling block number */
+	xfs_buf_t		*lbp;		/* left buffer pointer */
+	xfs_bmbt_block_t	*left;		/* left btree block */
+	xfs_bmbt_key_t		*lkp;		/* left btree key */
+	xfs_bmbt_ptr_t		*lpp;		/* left address pointer */
+	xfs_bmbt_rec_t		*lrp;		/* left record pointer */
+	xfs_buf_t		*rbp;		/* right buffer pointer */
+	xfs_bmbt_block_t	*right;		/* right btree block */
+	xfs_bmbt_key_t		*rkp;		/* right btree key */
+	xfs_bmbt_ptr_t		*rpp;		/* right address pointer */
+	xfs_bmbt_block_t	*rrblock;	/* right-right btree block */
+	xfs_buf_t		*rrbp;		/* right-right buffer pointer */
+	xfs_bmbt_rec_t		*rrp;		/* right record pointer */
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp);
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	lbp = cur->bc_bufs[level];
+	lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
+	left = XFS_BUF_TO_BMBT_BLOCK(lbp);
+	args.fsbno = cur->bc_private.b.firstblock;
+	if (args.fsbno == NULLFSBLOCK) {
+		args.fsbno = lbno;
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	} else if (cur->bc_private.b.flist->xbf_low)
+		args.type = XFS_ALLOCTYPE_FIRST_AG;
+	else
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.mod = args.minleft = args.alignment = args.total = args.isfl =
+		args.userdata = args.minalignslop = 0;
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+	if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return XFS_ERROR(ENOSPC);
+	}
+	if ((error = xfs_alloc_vextent(&args))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (args.fsbno == NULLFSBLOCK) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	cur->bc_private.b.ip->i_d.di_nblocks++;
+	xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+	if (XFS_IS_QUOTA_ON(args.mp) &&
+	    cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_uquotino &&
+	    cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+			XFS_TRANS_DQ_BCOUNT, 1L);
+	rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
+	right = XFS_BUF_TO_BMBT_BLOCK(rbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	INT_SET(right->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	right->bb_level = left->bb_level; /* INT_: direct copy */
+	INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
+	if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
+	    cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
+		INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+	i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
+	if (level > 0) {
+		lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
+		lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
+		rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
+		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) {
+				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+				return error;
+			}
+		}
+#endif
+		bcopy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		bcopy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+		xfs_bmbt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_bmbt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT);
+	} else {
+		lrp = XFS_BMAP_REC_IADDR(left, i, cur);
+		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
+		bcopy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_bmbt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		keyp->br_startoff = xfs_bmbt_get_startoff(rrp);
+	}
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
+	right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
+	INT_SET(left->bb_rightsib, ARCH_CONVERT, args.fsbno);
+	INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
+	xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
+	xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
+		if ((error = xfs_btree_read_bufl(args.mp, args.tp,
+				INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
+		if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.fsbno);
+		xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+	}
+	if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
+		xfs_btree_setbuf(cur, level, rbp);
+		cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	if (level + 1 < cur->bc_nlevels) {
+		if ((error = xfs_btree_dup_cursor(cur, curp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		(*curp)->bc_ptrs[level + 1]++;
+	}
+	*bnop = args.fsbno;
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+
+/*
+ * Update keys for the record.
+ */
+STATIC int
+xfs_bmbt_updkey(
+	xfs_btree_cur_t		*cur,
+	xfs_bmbt_key_t		*keyp,	/* on-disk format */
+	int			level)
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+#ifdef DEBUG
+	int			error;
+#endif
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_updkey";
+#endif
+	xfs_bmbt_key_t		*kp;
+	int			ptr;
+
+	ASSERT(level >= 1);
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
+	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+		block = xfs_bmbt_get_block(cur, level, &bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		ptr = cur->bc_ptrs[level];
+		kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
+		*kp = *keyp;
+		xfs_bmbt_log_keys(cur, bp, ptr, ptr);
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	return 0;
+}
+
+/*
+ * Convert on-disk form of btree root to in-memory form.
+ */
+void
+xfs_bmdr_to_bmbt(
+	xfs_bmdr_block_t	*dblock,
+	int			dblocklen,
+	xfs_bmbt_block_t	*rblock,
+	int			rblocklen)
+{
+	int			dmxr;
+	xfs_bmbt_key_t		*fkp;
+	xfs_bmbt_ptr_t		*fpp;
+	xfs_bmbt_key_t		*tkp;
+	xfs_bmbt_ptr_t		*tpp;
+
+	INT_SET(rblock->bb_magic, ARCH_CONVERT, XFS_BMAP_MAGIC);
+	rblock->bb_level = dblock->bb_level;	/* both in on-disk format */
+	ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0);
+	rblock->bb_numrecs = dblock->bb_numrecs;/* both in on-disk format */
+	INT_SET(rblock->bb_leftsib, ARCH_CONVERT, NULLDFSBNO);
+	INT_SET(rblock->bb_rightsib, ARCH_CONVERT, NULLDFSBNO);
+	dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
+	fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
+	tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
+	fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
+	tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+	dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT);
+	bcopy(fkp, tkp, sizeof(*fkp) * dmxr);
+	bcopy(fpp, tpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int						/* error */
+xfs_bmbt_decrement(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_decrement";
+#endif
+	xfs_fsblock_t		fsbno;
+	int			lev;
+	xfs_mount_t		*mp;
+	xfs_trans_t		*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	ASSERT(level < cur->bc_nlevels);
+	if (level < cur->bc_nlevels - 1)
+		xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+	if (--cur->bc_ptrs[level] > 0) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	block = xfs_bmbt_get_block(cur, level, &bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		if (--cur->bc_ptrs[lev] > 0)
+			break;
+		if (lev < cur->bc_nlevels - 1)
+			xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+	}
+	if (lev == cur->bc_nlevels) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+	for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
+		fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ */
+int					/* error */
+xfs_bmbt_delete(
+	xfs_btree_cur_t *cur,
+	int		async,		/* deletion can be async */
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_delete";
+#endif
+	int		i;
+	int		level;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	for (level = 0, i = 2; i == 2; level++) {
+		if ((error = xfs_bmbt_delrec(cur, level, async, &i))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+	}
+	if (i == 0) {
+		for (level = 1; level < cur->bc_nlevels; level++) {
+			if (cur->bc_ptrs[level] == 0) {
+				if ((error = xfs_bmbt_decrement(cur, level,
+						&i))) {
+					XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+					return error;
+				}
+				break;
+			}
+		}
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = i;
+	return 0;
+}
+
+/*
+ * Convert a compressed bmap extent record to an uncompressed form.
+ * This code must be in sync with the routines xfs_bmbt_get_startoff,
+ * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
+ */
+void
+xfs_bmbt_get_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s)
+{
+	int	ext_flag;
+	xfs_exntst_t st;
+	__uint64_t	l0, l1;
+
+	l0 = INT_GET(r->l0, ARCH_CONVERT);
+	l1 = INT_GET(r->l1, ARCH_CONVERT);
+	ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
+	s->br_startoff = ((xfs_fileoff_t)l0 &
+			   XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+#if XFS_BIG_FILESYSTEMS
+	s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) |
+			   (((xfs_fsblock_t)l1) >> 21);
+#else
+#ifdef DEBUG
+	{
+		xfs_dfsbno_t	b;
+
+		b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) |
+		    (((xfs_dfsbno_t)l1) >> 21);
+		ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+		s->br_startblock = (xfs_fsblock_t)b;
+	}
+#else	/* !DEBUG */
+	s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
+#endif	/* DEBUG */
+#endif	/* XFS_BIG_FILESYSTEMS */
+	s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21));
+	/* This is xfs_extent_state() in-line */
+	if (ext_flag) {
+		ASSERT(s->br_blockcount != 0);	/* saved for DMIG */
+		st = XFS_EXT_UNWRITTEN;
+	} else
+		st = XFS_EXT_NORM;
+	s->br_state = st;
+}
+
+/*
+ * Get the block pointer for the given level of the cursor.
+ * Fill in the buffer pointer, if applicable.
+ */
+xfs_bmbt_block_t *
+xfs_bmbt_get_block(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	xfs_buf_t		**bpp)
+{
+	xfs_ifork_t		*ifp;
+	xfs_bmbt_block_t	*rval;
+
+	if (level < cur->bc_nlevels - 1) {
+		*bpp = cur->bc_bufs[level];
+		rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
+	} else {
+		*bpp = 0;
+		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+			cur->bc_private.b.whichfork);
+		rval = ifp->if_broot;
+	}
+	return rval;
+}
+
+/*
+ * Extract the blockcount field from a bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+	xfs_bmbt_rec_t	*r)
+{
+	return (xfs_filblks_t)(INT_GET(r->l1, ARCH_CONVERT) & XFS_MASK64LO(21));
+}
+
+/*
+ * Extract the startblock field from a bmap extent record.
+ */
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+	xfs_bmbt_rec_t	*r)
+{
+#if XFS_BIG_FILESYSTEMS
+	return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
+	       (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
+#else
+#ifdef DEBUG
+	xfs_dfsbno_t	b;
+
+	b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) |
+	    (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
+	ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+	return (xfs_fsblock_t)b;
+#else	/* !DEBUG */
+	return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21);
+#endif	/* DEBUG */
+#endif	/* XFS_BIG_FILESYSTEMS */
+}
+
+/*
+ * Extract the startoff field from a bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+	xfs_bmbt_rec_t	*r)
+{
+	return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) &
+		 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+}
+
+xfs_exntst_t
+xfs_bmbt_get_state(
+	xfs_bmbt_rec_t	*r)
+{
+	int	ext_flag;
+
+	ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN));
+	return xfs_extent_state(xfs_bmbt_get_blockcount(r),
+				ext_flag);
+}
+
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int						/* error */
+xfs_bmbt_increment(
+	xfs_btree_cur_t		*cur,
+	int			level,
+	int			*stat)		/* success/failure */
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+	int			error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_increment";
+#endif
+	xfs_fsblock_t		fsbno;
+	int			lev;
+	xfs_mount_t		*mp;
+	xfs_trans_t		*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGI(cur, level);
+	ASSERT(level < cur->bc_nlevels);
+	if (level < cur->bc_nlevels - 1)
+		xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+	block = xfs_bmbt_get_block(cur, level, &bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 1;
+		return 0;
+	}
+	if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		block = xfs_bmbt_get_block(cur, lev, &bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			break;
+		if (lev < cur->bc_nlevels - 1)
+			xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+	}
+	if (lev == cur->bc_nlevels) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+	for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
+		fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
+				XFS_BMAP_BTREE_REF))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		cur->bc_ptrs[lev] = 1;
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Insert the current record at the point referenced by cur.
+ */
+int					/* error */
+xfs_bmbt_insert(
+	xfs_btree_cur_t *cur,
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+#ifdef XFS_BMBT_TRACE
+	static char	fname[] = "xfs_bmbt_insert";
+#endif
+	int		i;
+	int		level;
+	xfs_fsblock_t	nbno;
+	xfs_btree_cur_t *ncur;
+	xfs_bmbt_rec_t	nrec;
+	xfs_btree_cur_t *pcur;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	level = 0;
+	nbno = NULLFSBLOCK;
+	xfs_bmbt_set_all(&nrec, &cur->bc_rec.b);
+	ncur = (xfs_btree_cur_t *)0;
+	pcur = cur;
+	do {
+		if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
+				&i))) {
+			if (pcur != cur)
+				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
+			cur->bc_nlevels = pcur->bc_nlevels;
+			cur->bc_private.b.allocated +=
+				pcur->bc_private.b.allocated;
+			pcur->bc_private.b.allocated = 0;
+			ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
+			       (cur->bc_private.b.ip->i_d.di_flags &
+				XFS_DIFLAG_REALTIME));
+			cur->bc_private.b.firstblock =
+				pcur->bc_private.b.firstblock;
+			ASSERT(cur->bc_private.b.flist ==
+			       pcur->bc_private.b.flist);
+			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+		}
+		if (ncur) {
+			pcur = ncur;
+			ncur = (xfs_btree_cur_t *)0;
+		}
+	} while (nbno != NULLFSBLOCK);
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*stat = i;
+	return 0;
+error0:
+	XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+	return error;
+}
+
+/*
+ * Log fields from the btree block header.
+ */
+void
+xfs_bmbt_log_block(
+	xfs_btree_cur_t		*cur,
+	xfs_buf_t		*bp,
+	int			fields)
+{
+	int			first;
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_log_block";
+#endif
+	int			last;
+	xfs_trans_t		*tp;
+	static const short	offsets[] = {
+		offsetof(xfs_bmbt_block_t, bb_magic),
+		offsetof(xfs_bmbt_block_t, bb_level),
+		offsetof(xfs_bmbt_block_t, bb_numrecs),
+		offsetof(xfs_bmbt_block_t, bb_leftsib),
+		offsetof(xfs_bmbt_block_t, bb_rightsib),
+		sizeof(xfs_bmbt_block_t)
+	};
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
+	tp = cur->bc_tp;
+	if (bp) {
+		xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
+				  &last);
+		xfs_trans_log_buf(tp, bp, first, last);
+	} else
+		xfs_trans_log_inode(tp, cur->bc_private.b.ip,
+			XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_bmbt_log_recs(
+	xfs_btree_cur_t		*cur,
+	xfs_buf_t		*bp,
+	int			rfirst,
+	int			rlast)
+{
+	xfs_bmbt_block_t	*block;
+	int			first;
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_log_recs";
+#endif
+	int			last;
+	xfs_bmbt_rec_t		*rp;
+	xfs_trans_t		*tp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
+	ASSERT(bp);
+	tp = cur->bc_tp;
+	block = XFS_BUF_TO_BMBT_BLOCK(bp);
+	rp = XFS_BMAP_REC_DADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(tp, bp, first, last);
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+}
+
+int					/* error */
+xfs_bmbt_lookup_eq(
+	xfs_btree_cur_t *cur,
+	xfs_fileoff_t	off,
+	xfs_fsblock_t	bno,
+	xfs_filblks_t	len,
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+int					/* error */
+xfs_bmbt_lookup_ge(
+	xfs_btree_cur_t *cur,
+	xfs_fileoff_t	off,
+	xfs_fsblock_t	bno,
+	xfs_filblks_t	len,
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+int					/* error */
+xfs_bmbt_lookup_le(
+	xfs_btree_cur_t *cur,
+	xfs_fileoff_t	off,
+	xfs_fsblock_t	bno,
+	xfs_filblks_t	len,
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_bmbt_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Give the bmap btree a new root block.  Copy the old broot contents
+ * down into a real block and make the broot point to it.
+ */
+int						/* error */
+xfs_bmbt_newroot(
+	xfs_btree_cur_t		*cur,		/* btree cursor */
+	int			*logflags,	/* logging flags for inode */
+	int			*stat)		/* return status - 0 fail */
+{
+	xfs_alloc_arg_t		args;		/* allocation arguments */
+	xfs_bmbt_block_t	*block;		/* bmap btree block */
+	xfs_buf_t		*bp;		/* buffer for block */
+	xfs_bmbt_block_t	*cblock;	/* child btree block */
+	xfs_bmbt_key_t		*ckp;		/* child key pointer */
+	xfs_bmbt_ptr_t		*cpp;		/* child ptr pointer */
+	int			error;		/* error return code */
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_newroot";
+#endif
+#ifdef DEBUG
+	int			i;		/* loop counter */
+#endif
+	xfs_bmbt_key_t		*kp;		/* pointer to bmap btree key */
+	int			level;		/* btree level */
+	xfs_bmbt_ptr_t		*pp;		/* pointer to bmap block addr */
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	level = cur->bc_nlevels - 1;
+	block = xfs_bmbt_get_block(cur, level, &bp);
+	/*
+	 * Copy the root into a real block.
+	 */
+	args.mp = cur->bc_mp;
+	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+	args.tp = cur->bc_tp;
+	args.fsbno = cur->bc_private.b.firstblock;
+	args.mod = args.minleft = args.alignment = args.total = args.isfl =
+		args.userdata = args.minalignslop = 0;
+	args.minlen = args.maxlen = args.prod = 1;
+	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+	if (args.fsbno == NULLFSBLOCK) {
+#ifdef DEBUG
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+#endif
+		args.fsbno = INT_GET(*pp, ARCH_CONVERT);
+		args.type = XFS_ALLOCTYPE_START_BNO;
+	} else if (args.wasdel)
+		args.type = XFS_ALLOCTYPE_FIRST_AG;
+	else
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	if ((error = xfs_alloc_vextent(&args))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	if (args.fsbno == NULLFSBLOCK) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	cur->bc_private.b.firstblock = args.fsbno;
+	cur->bc_private.b.allocated++;
+	cur->bc_private.b.ip->i_d.di_nblocks++;
+	if (XFS_IS_QUOTA_ON(args.mp) &&
+	    cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_uquotino &&
+	    cur->bc_private.b.ip->i_ino != args.mp->m_sb.sb_gquotino)
+		xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
+					  XFS_TRANS_DQ_BCOUNT, 1L);
+	bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
+	cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
+	*cblock = *block;
+	INT_MOD(block->bb_level, ARCH_CONVERT, +1);
+	INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
+	cur->bc_nlevels++;
+	cur->bc_ptrs[level + 1] = 1;
+	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
+	ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
+	bcopy(kp, ckp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*kp));
+	cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
+#ifdef DEBUG
+	for (i = 0; i < INT_GET(cblock->bb_numrecs, ARCH_CONVERT); i++) {
+		if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
+			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+			return error;
+		}
+	}
+#endif
+	bcopy(pp, cpp, INT_GET(cblock->bb_numrecs, ARCH_CONVERT) * sizeof(*pp));
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno,
+			level))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	INT_SET(*pp, ARCH_CONVERT, args.fsbno);
+	xfs_iroot_realloc(cur->bc_private.b.ip, 1 - INT_GET(cblock->bb_numrecs, ARCH_CONVERT),
+		cur->bc_private.b.whichfork);
+	xfs_btree_setbuf(cur, level, bp);
+	/*
+	 * Do all this logging at the end so that
+	 * the root is at the right level.
+	 */
+	xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
+	xfs_bmbt_log_keys(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
+	xfs_bmbt_log_ptrs(cur, bp, 1, INT_GET(cblock->bb_numrecs, ARCH_CONVERT));
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	*logflags |=
+		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Set all the fields in a bmap extent record from the uncompressed form.
+ */
+void
+xfs_bmbt_set_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s)
+{
+	int	extent_flag;
+
+	ASSERT((s->br_state == XFS_EXT_NORM) ||
+		(s->br_state == XFS_EXT_UNWRITTEN));
+	extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1;
+	ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0);
+	ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0);
+#if XFS_BIG_FILESYSTEMS
+	ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0);
+#endif	/* XFS_BIG_FILESYSTEMS */
+#if XFS_BIG_FILESYSTEMS
+	INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+		  ((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
+		  ((xfs_bmbt_rec_base_t)s->br_startblock >> 43));
+	INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+		  ((xfs_bmbt_rec_base_t)s->br_blockcount &
+		   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+#else	/* !XFS_BIG_FILESYSTEMS */
+	if (ISNULLSTARTBLOCK(s->br_startblock)) {
+		INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)s->br_startoff << 9) |
+			  (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
+		INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
+			  ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+			  ((xfs_bmbt_rec_base_t)s->br_blockcount &
+			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	} else {
+		INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)s->br_startoff << 9));
+		INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) |
+			  ((xfs_bmbt_rec_base_t)s->br_blockcount &
+			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	}
+#endif	/* XFS_BIG_FILESYSTEMS */
+}
+
+/*
+ * Set all the fields in a bmap extent record from the arguments.
+ */
+void
+xfs_bmbt_set_allf(
+	xfs_bmbt_rec_t	*r,
+	xfs_fileoff_t	o,
+	xfs_fsblock_t	b,
+	xfs_filblks_t	c,
+	xfs_exntst_t	v)
+{
+	int	extent_flag;
+
+	ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN));
+	extent_flag = (v == XFS_EXT_NORM) ? 0 : 1;
+	ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+#if XFS_BIG_FILESYSTEMS
+	ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+#endif	/* XFS_BIG_FILESYSTEMS */
+#if XFS_BIG_FILESYSTEMS
+	INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+		((xfs_bmbt_rec_base_t)o << 9) |
+		((xfs_bmbt_rec_base_t)b >> 43));
+	INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
+		  ((xfs_bmbt_rec_base_t)c &
+		   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+#else	/* !XFS_BIG_FILESYSTEMS */
+	if (ISNULLSTARTBLOCK(b)) {
+		INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)o << 9) |
+			 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
+		INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) |
+			  ((xfs_bmbt_rec_base_t)b << 21) |
+			  ((xfs_bmbt_rec_base_t)c &
+			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	} else {
+		INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) |
+			((xfs_bmbt_rec_base_t)o << 9));
+		INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) |
+			  ((xfs_bmbt_rec_base_t)c &
+			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	}
+#endif	/* XFS_BIG_FILESYSTEMS */
+}
+
+/*
+ * Set the blockcount field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_blockcount(
+	xfs_bmbt_rec_t	*r,
+	xfs_filblks_t	v)
+{
+	ASSERT((v & XFS_MASK64HI(43)) == 0);
+	INT_SET(r->l1, ARCH_CONVERT, (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) |
+		  (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21)));
+}
+
+/*
+ * Set the startblock field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startblock(
+	xfs_bmbt_rec_t	*r,
+	xfs_fsblock_t	v)
+{
+#if XFS_BIG_FILESYSTEMS
+	ASSERT((v & XFS_MASK64HI(12)) == 0);
+#endif	/* XFS_BIG_FILESYSTEMS */
+#if XFS_BIG_FILESYSTEMS
+	INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) |
+		  (xfs_bmbt_rec_base_t)(v >> 43));
+	INT_SET(r->l1, ARCH_CONVERT, (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) |
+		  (xfs_bmbt_rec_base_t)(v << 21));
+#else	/* !XFS_BIG_FILESYSTEMS */
+	if (ISNULLSTARTBLOCK(v)) {
+		INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) | (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)));
+		INT_SET(r->l1, ARCH_CONVERT, (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) |
+			  ((xfs_bmbt_rec_base_t)v << 21) |
+			  (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	} else {
+		INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) & ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9)));
+		INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)v << 21) |
+			  (INT_GET(r->l1, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+	}
+#endif	/* XFS_BIG_FILESYSTEMS */
+}
+
+/*
+ * Set the startoff field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_startoff(
+	xfs_bmbt_rec_t	*r,
+	xfs_fileoff_t	v)
+{
+	ASSERT((v & XFS_MASK64HI(9)) == 0);
+	INT_SET(r->l0, ARCH_CONVERT, (INT_GET(r->l0, ARCH_CONVERT) & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) |
+		((xfs_bmbt_rec_base_t)v << 9) |
+		  (INT_GET(r->l0, ARCH_CONVERT) & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)));
+}
+
+/*
+ * Set the extent state field in a bmap extent record.
+ */
+void
+xfs_bmbt_set_state(
+	xfs_bmbt_rec_t	*r,
+	xfs_exntst_t	v)
+{
+	ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
+	if (v == XFS_EXT_NORM)
+		INT_SET(r->l0, ARCH_CONVERT, INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN));
+	else
+		INT_SET(r->l0, ARCH_CONVERT, INT_GET(r->l0, ARCH_CONVERT) | XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN));
+}
+
+/*
+ * Convert in-memory form of btree root to on-disk form.
+ */
+void
+xfs_bmbt_to_bmdr(
+	xfs_bmbt_block_t	*rblock,
+	int			rblocklen,
+	xfs_bmdr_block_t	*dblock,
+	int			dblocklen)
+{
+	int			dmxr;
+	xfs_bmbt_key_t		*fkp;
+	xfs_bmbt_ptr_t		*fpp;
+	xfs_bmbt_key_t		*tkp;
+	xfs_bmbt_ptr_t		*tpp;
+
+	ASSERT(INT_GET(rblock->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC);
+	ASSERT(INT_GET(rblock->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO);
+	ASSERT(INT_GET(rblock->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO);
+	ASSERT(INT_GET(rblock->bb_level, ARCH_CONVERT) > 0);
+	dblock->bb_level = rblock->bb_level;	/* both in on-disk format */
+	dblock->bb_numrecs = rblock->bb_numrecs;/* both in on-disk format */
+	dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
+	fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
+	tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
+	fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+	tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
+	dmxr = INT_GET(dblock->bb_numrecs, ARCH_CONVERT);
+	bcopy(fkp, tkp, sizeof(*fkp) * dmxr);
+	bcopy(fpp, tpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
+}
+
+/*
+ * Update the record to the passed values.
+ */
+int
+xfs_bmbt_update(
+	xfs_btree_cur_t		*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	xfs_exntst_t		state)
+{
+	xfs_bmbt_block_t	*block;
+	xfs_buf_t		*bp;
+	int			error;
+#ifdef XFS_BMBT_TRACE
+	static char		fname[] = "xfs_bmbt_update";
+#endif
+	xfs_bmbt_key_t		key;
+	int			ptr;
+	xfs_bmbt_rec_t		*rp;
+
+	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
+	XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
+		(xfs_dfilblks_t)len, (int)state);
+	block = xfs_bmbt_get_block(cur, 0, &bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+#endif
+	ptr = cur->bc_ptrs[0];
+	rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
+	xfs_bmbt_set_allf(rp, off, bno, len, state);
+	xfs_bmbt_log_recs(cur, bp, ptr, ptr);
+	if (ptr > 1) {
+		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+		return 0;
+	}
+	INT_SET(key.br_startoff, ARCH_CONVERT, off);
+	if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
+		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+		return error;
+	}
+	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
+	return 0;
+}
+
+/*
+ * Check an extent list, which has just been read, for
+ * any bit in the extent flag field. ASSERT on debug
+ * kernels, as this condition should not occur.
+ * Return an error condition (1) if any flags found,
+ * otherwise return 0.
+ */
+int
+xfs_check_nostate_extents(
+	xfs_bmbt_rec_t		*ep,
+	xfs_extnum_t		num)
+{
+	for (; num > 0; num--, ep++) {
+		if (((INT_GET(ep->l0, ARCH_CONVERT)) >>
+		     (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
+			ASSERT(0);
+			return 1;
+		}
+	}
+	return 0;
+}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
new file mode 100644
index 000000000000..a9ec9c58252d
--- /dev/null
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BMAP_BTREE_H__
+#define __XFS_BMAP_BTREE_H__
+
+#define XFS_BMAP_MAGIC	0x424d4150	/* 'BMAP' */
+
+struct xfs_btree_cur;
+struct xfs_btree_lblock;
+struct xfs_mount;
+struct xfs_inode;
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block
+{
+	__uint16_t	bb_level;	/* 0 is a leaf */
+	__uint16_t	bb_numrecs;	/* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ * For 32-bit kernels,
+ *  l0:31 is an extent flag (value 1 indicates non-normal).
+ *  l0:0-30 and l1:9-31 are startoff.
+ *  l1:0-8, l2:0-31, and l3:21-31 are startblock.
+ *  l3:0-20 are blockcount.
+ * For 64-bit kernels,
+ *  l0:63 is an extent flag (value 1 indicates non-normal).
+ *  l0:9-62 are startoff.
+ *  l0:0-8 and l1:21-63 are startblock.
+ *  l1:0-20 are blockcount.
+ */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+
+#define BMBT_TOTAL_BITLEN	128	/* 128 bits, 16 bytes */
+#define BMBT_EXNTFLAG_BITOFF	0
+#define BMBT_EXNTFLAG_BITLEN	1
+#define BMBT_STARTOFF_BITOFF	(BMBT_EXNTFLAG_BITOFF + BMBT_EXNTFLAG_BITLEN)
+#define BMBT_STARTOFF_BITLEN	54
+#define BMBT_STARTBLOCK_BITOFF	(BMBT_STARTOFF_BITOFF + BMBT_STARTOFF_BITLEN)
+#define BMBT_STARTBLOCK_BITLEN	52
+#define BMBT_BLOCKCOUNT_BITOFF	\
+	(BMBT_STARTBLOCK_BITOFF + BMBT_STARTBLOCK_BITLEN)
+#define BMBT_BLOCKCOUNT_BITLEN	(BMBT_TOTAL_BITLEN - BMBT_BLOCKCOUNT_BITOFF)
+
+#else
+
+#define BMBT_TOTAL_BITLEN	128	/* 128 bits, 16 bytes */
+#define BMBT_EXNTFLAG_BITOFF	63
+#define BMBT_EXNTFLAG_BITLEN	1
+#define BMBT_STARTOFF_BITOFF	(BMBT_EXNTFLAG_BITOFF - BMBT_STARTOFF_BITLEN)
+#define BMBT_STARTOFF_BITLEN	54
+#define BMBT_STARTBLOCK_BITOFF	85 /* 128 - 43 (other 9 is in first word) */
+#define BMBT_STARTBLOCK_BITLEN	52
+#define BMBT_BLOCKCOUNT_BITOFF	64 /* Start of second 64 bit container */
+#define BMBT_BLOCKCOUNT_BITLEN	21
+
+#endif
+
+
+#define BMBT_USE_64	1
+
+typedef struct xfs_bmbt_rec_32
+{
+	__uint32_t		l0, l1, l2, l3;
+} xfs_bmbt_rec_32_t;
+typedef struct xfs_bmbt_rec_64
+{
+	__uint64_t		l0, l1;
+} xfs_bmbt_rec_64_t;
+
+typedef __uint64_t	xfs_bmbt_rec_base_t;	/* use this for casts */
+typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS	17
+#define STARTBLOCKMASKBITS	(15 + XFS_BIG_FILESYSTEMS * 20)
+#define DSTARTBLOCKMASKBITS	(15 + 20)
+#define STARTBLOCKMASK		\
+	(((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#define DSTARTBLOCKMASK		\
+	(((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLSTARTBLOCK)
+int isnullstartblock(xfs_fsblock_t x);
+#define ISNULLSTARTBLOCK(x)	isnullstartblock(x)
+#else
+#define ISNULLSTARTBLOCK(x)	(((x) & STARTBLOCKMASK) == STARTBLOCKMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_ISNULLDSTARTBLOCK)
+int isnulldstartblock(xfs_dfsbno_t x);
+#define ISNULLDSTARTBLOCK(x)	isnulldstartblock(x)
+#else
+#define ISNULLDSTARTBLOCK(x)	(((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_NULLSTARTBLOCK)
+xfs_fsblock_t nullstartblock(int k);
+#define NULLSTARTBLOCK(k)	nullstartblock(k)
+#else
+#define NULLSTARTBLOCK(k)	\
+	((ASSERT(k < (1 << STARTBLOCKVALBITS))), (STARTBLOCKMASK | (k)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_STARTBLOCKVAL)
+xfs_filblks_t startblockval(xfs_fsblock_t x);
+#define STARTBLOCKVAL(x)	startblockval(x)
+#else
+#define STARTBLOCKVAL(x)	((xfs_filblks_t)((x) & ~STARTBLOCKMASK))
+#endif
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+	XFS_EXTFMT_NOSTATE = 0,
+	XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+	XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+	XFS_EXT_DMAPI_OFFLINE
+} xfs_exntst_t;
+
+/*
+ * Extent state and extent format macros.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTFMT_INODE )
+xfs_exntfmt_t xfs_extfmt_inode(struct xfs_inode *ip);
+#define XFS_EXTFMT_INODE(x)	xfs_extfmt_inode(x)
+#else
+#define XFS_EXTFMT_INODE(x) \
+  (XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \
+	XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
+#endif
+#define ISUNWRITTEN(x)		((x) == XFS_EXT_UNWRITTEN)
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+	xfs_fileoff_t	br_startoff;	/* starting file offset */
+	xfs_fsblock_t	br_startblock;	/* starting block number */
+	xfs_filblks_t	br_blockcount;	/* number of blocks */
+	xfs_exntst_t	br_state;	/* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key
+{
+	xfs_dfiloff_t	br_startoff;	/* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;	/* btree pointer type */
+					/* btree block header type */
+typedef struct xfs_btree_lblock xfs_bmbt_block_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BMBT_BLOCK)
+xfs_bmbt_block_t *xfs_buf_to_bmbt_block(struct xfs_buf *bp);
+#define XFS_BUF_TO_BMBT_BLOCK(bp)		xfs_buf_to_bmbt_block(bp)
+#else
+#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_DSIZE)
+int xfs_bmap_rblock_dsize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_RBLOCK_DSIZE(lev,cur)		xfs_bmap_rblock_dsize(lev,cur)
+#else
+#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_RBLOCK_ISIZE)
+int xfs_bmap_rblock_isize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_RBLOCK_ISIZE(lev,cur)		xfs_bmap_rblock_isize(lev,cur)
+#else
+#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
+	((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
+			    (cur)->bc_private.b.whichfork)->if_broot_bytes)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_IBLOCK_SIZE)
+int xfs_bmap_iblock_size(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_IBLOCK_SIZE(lev,cur)		xfs_bmap_iblock_size(lev,cur)
+#else
+#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DSIZE)
+int xfs_bmap_block_dsize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_DSIZE(lev,cur)		xfs_bmap_block_dsize(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BMAP_RBLOCK_DSIZE(lev,cur) : \
+		XFS_BMAP_IBLOCK_SIZE(lev,cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_ISIZE)
+int xfs_bmap_block_isize(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_ISIZE(lev,cur)		xfs_bmap_block_isize(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BMAP_RBLOCK_ISIZE(lev,cur) : \
+		XFS_BMAP_IBLOCK_SIZE(lev,cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMAXRECS)
+int xfs_bmap_block_dmaxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur)	xfs_bmap_block_dmaxrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
+			xfs_bmdr, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMAXRECS)
+int xfs_bmap_block_imaxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur)	xfs_bmap_block_imaxrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
+			xfs_bmbt, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_DMINRECS)
+int xfs_bmap_block_dminrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_DMINRECS(lev,cur)	xfs_bmap_block_dminrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
+			xfs_bmdr, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BLOCK_IMINRECS)
+int xfs_bmap_block_iminrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_BMAP_BLOCK_IMINRECS(lev,cur)	xfs_bmap_block_iminrecs(lev,cur)
+#else
+#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
+	((lev) == (cur)->bc_nlevels - 1 ? \
+		XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur), \
+			xfs_bmbt, (lev) == 0) : \
+		((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_DADDR)
+xfs_bmbt_rec_t *
+xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_REC_DADDR(bb,i,cur)		xfs_bmap_rec_daddr(bb,i,cur)
+#else
+#define XFS_BMAP_REC_DADDR(bb,i,cur) \
+	XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_REC_IADDR)
+xfs_bmbt_rec_t *
+xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_REC_IADDR(bb,i,cur)		xfs_bmap_rec_iaddr(bb,i,cur)
+#else
+#define XFS_BMAP_REC_IADDR(bb,i,cur) \
+	XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_DADDR)
+xfs_bmbt_key_t *
+xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_KEY_DADDR(bb,i,cur)		xfs_bmap_key_daddr(bb,i,cur)
+#else
+#define XFS_BMAP_KEY_DADDR(bb,i,cur) \
+	XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_KEY_IADDR)
+xfs_bmbt_key_t *
+xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_KEY_IADDR(bb,i,cur)		xfs_bmap_key_iaddr(bb,i,cur)
+#else
+#define XFS_BMAP_KEY_IADDR(bb,i,cur) \
+	XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_DADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_PTR_DADDR(bb,i,cur)		xfs_bmap_ptr_daddr(bb,i,cur)
+#else
+#define XFS_BMAP_PTR_DADDR(bb,i,cur) \
+	XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_PTR_IADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_BMAP_PTR_IADDR(bb,i,cur)		xfs_bmap_ptr_iaddr(bb,i,cur)
+#else
+#define XFS_BMAP_PTR_IADDR(bb,i,cur) \
+	XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE(		\
+		INT_GET((bb)->bb_level, ARCH_CONVERT), cur),	\
+		xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS(	\
+			INT_GET((bb)->bb_level, ARCH_CONVERT), cur))
+#endif
+
+/*
+ * These are to be used when we know the size of the block and
+ * we don't have a cursor.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_REC_ADDR)
+xfs_bmbt_rec_t *xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz);
+#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz)	xfs_bmap_broot_rec_addr(bb,i,sz)
+#else
+#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
+	XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_KEY_ADDR)
+xfs_bmbt_key_t *xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz);
+#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz)	xfs_bmap_broot_key_addr(bb,i,sz)
+#else
+#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
+	XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_PTR_ADDR)
+xfs_bmbt_ptr_t *xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz);
+#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz)	xfs_bmap_broot_ptr_addr(bb,i,sz)
+#else
+#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
+	XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_NUMRECS)
+int xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb);
+#define XFS_BMAP_BROOT_NUMRECS(bb)		xfs_bmap_broot_numrecs(bb)
+#else
+#define XFS_BMAP_BROOT_NUMRECS(bb) (INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_MAXRECS)
+int xfs_bmap_broot_maxrecs(int sz);
+#define XFS_BMAP_BROOT_MAXRECS(sz)		xfs_bmap_broot_maxrecs(sz)
+#else
+#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE_CALC)
+int xfs_bmap_broot_space_calc(int nrecs);
+#define XFS_BMAP_BROOT_SPACE_CALC(nrecs)	xfs_bmap_broot_space_calc(nrecs)
+#else
+#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
+	((int)(sizeof(xfs_bmbt_block_t) + \
+	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_BROOT_SPACE)
+int xfs_bmap_broot_space(xfs_bmdr_block_t *bb);
+#define XFS_BMAP_BROOT_SPACE(bb)		xfs_bmap_broot_space(bb)
+#else
+#define XFS_BMAP_BROOT_SPACE(bb) \
+	XFS_BMAP_BROOT_SPACE_CALC(INT_GET((bb)->bb_numrecs, ARCH_CONVERT))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMDR_SPACE_CALC)
+int xfs_bmdr_space_calc(int nrecs);
+#define XFS_BMDR_SPACE_CALC(nrecs)		xfs_bmdr_space_calc(nrecs)
+#else
+#define XFS_BMDR_SPACE_CALC(nrecs)	\
+	((int)(sizeof(xfs_bmdr_block_t) + \
+	       ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))))
+#endif
+
+/*
+ * Maximum number of bmap btree levels.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BM_MAXLEVELS)
+int xfs_bm_maxlevels(struct xfs_mount *mp, int w);
+#define XFS_BM_MAXLEVELS(mp,w)			xfs_bm_maxlevels(mp,w)
+#else
+#define XFS_BM_MAXLEVELS(mp,w)		((mp)->m_bm_maxlevels[w])
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BMAP_SANITY_CHECK)
+int xfs_bmap_sanity_check(struct xfs_mount *mp, xfs_bmbt_block_t *bb,
+	int level);
+#define XFS_BMAP_SANITY_CHECK(mp,bb,level)	\
+	xfs_bmap_sanity_check(mp,bb,level)
+#else
+#define XFS_BMAP_SANITY_CHECK(mp,bb,level)	\
+	(INT_GET((bb)->bb_magic, ARCH_CONVERT) == XFS_BMAP_MAGIC && \
+	 INT_GET((bb)->bb_level, ARCH_CONVERT) == level && \
+	 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) > 0 && \
+	 INT_GET((bb)->bb_numrecs, ARCH_CONVERT) <= (mp)->m_bmap_dmxr[(level) != 0])
+#endif
+
+/*
+ * Trace buffer entry types.
+ */
+#define XFS_BMBT_KTRACE_ARGBI	1
+#define XFS_BMBT_KTRACE_ARGBII	2
+#define XFS_BMBT_KTRACE_ARGFFFI 3
+#define XFS_BMBT_KTRACE_ARGI	4
+#define XFS_BMBT_KTRACE_ARGIFK	5
+#define XFS_BMBT_KTRACE_ARGIFR	6
+#define XFS_BMBT_KTRACE_ARGIK	7
+#define XFS_BMBT_KTRACE_CUR	8
+
+#define XFS_BMBT_TRACE_SIZE	4096	/* size of global trace buffer */
+#define XFS_BMBT_KTRACE_SIZE	32	/* size of per-inode trace buffer */
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_BMBT_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_BMBT_TRACE
+#endif
+
+
+/*
+ * Prototypes for xfs_bmap.c to call.
+ */
+
+void
+xfs_bmdr_to_bmbt(
+	xfs_bmdr_block_t *,
+	int,
+	xfs_bmbt_block_t *,
+	int);
+
+int
+xfs_bmbt_decrement(
+	struct xfs_btree_cur *,
+	int,
+	int *);
+
+int
+xfs_bmbt_delete(
+	struct xfs_btree_cur *,
+	int,
+	int *);
+
+void
+xfs_bmbt_get_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s);
+
+xfs_bmbt_block_t *
+xfs_bmbt_get_block(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	struct xfs_buf		**bpp);
+
+xfs_filblks_t
+xfs_bmbt_get_blockcount(
+	xfs_bmbt_rec_t	*r);
+
+xfs_fsblock_t
+xfs_bmbt_get_startblock(
+	xfs_bmbt_rec_t	*r);
+
+xfs_fileoff_t
+xfs_bmbt_get_startoff(
+	xfs_bmbt_rec_t	*r);
+
+xfs_exntst_t
+xfs_bmbt_get_state(
+	xfs_bmbt_rec_t	*r);
+
+int
+xfs_bmbt_increment(
+	struct xfs_btree_cur *,
+	int,
+	int *);
+
+int
+xfs_bmbt_insert(
+	struct xfs_btree_cur *,
+	int *);
+
+int
+xfs_bmbt_insert_many(
+	struct xfs_btree_cur *,
+	int,
+	xfs_bmbt_rec_t *,
+	int *);
+
+void
+xfs_bmbt_log_block(
+	struct xfs_btree_cur *,
+	struct xfs_buf *,
+	int);
+
+void
+xfs_bmbt_log_recs(
+	struct xfs_btree_cur *,
+	struct xfs_buf *,
+	int,
+	int);
+
+int
+xfs_bmbt_lookup_eq(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	int *);
+
+int
+xfs_bmbt_lookup_ge(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	int *);
+
+int
+xfs_bmbt_lookup_le(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	int *);
+
+/*
+ * Give the bmap btree a new root block.  Copy the old broot contents
+ * down into a real block and make the broot point to it.
+ */
+int						/* error */
+xfs_bmbt_newroot(
+	struct xfs_btree_cur	*cur,		/* btree cursor */
+	int			*logflags,	/* logging flags for inode */
+	int			*stat);		/* return status - 0 fail */
+
+void
+xfs_bmbt_set_all(
+	xfs_bmbt_rec_t	*r,
+	xfs_bmbt_irec_t *s);
+
+void
+xfs_bmbt_set_allf(
+	xfs_bmbt_rec_t	*r,
+	xfs_fileoff_t	o,
+	xfs_fsblock_t	b,
+	xfs_filblks_t	c,
+	xfs_exntst_t	v);
+
+void
+xfs_bmbt_set_blockcount(
+	xfs_bmbt_rec_t	*r,
+	xfs_filblks_t	v);
+
+void
+xfs_bmbt_set_startblock(
+	xfs_bmbt_rec_t	*r,
+	xfs_fsblock_t	v);
+
+void
+xfs_bmbt_set_startoff(
+	xfs_bmbt_rec_t	*r,
+	xfs_fileoff_t	v);
+
+void
+xfs_bmbt_set_state(
+	xfs_bmbt_rec_t	*r,
+	xfs_exntst_t	v);
+
+void
+xfs_bmbt_to_bmdr(
+	xfs_bmbt_block_t *,
+	int,
+	xfs_bmdr_block_t *,
+	int);
+
+int
+xfs_bmbt_update(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t,
+	xfs_fsblock_t,
+	xfs_filblks_t,
+	xfs_exntst_t);
+
+#ifdef XFSDEBUG
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_bmbt_get_rec(
+	struct xfs_btree_cur *,
+	xfs_fileoff_t *,
+	xfs_fsblock_t *,
+	xfs_filblks_t *,
+	xfs_exntst_t *,
+	int *);
+#endif
+
+
+/*
+ * Search an extent list for the extent which includes block
+ * bno.
+ */
+xfs_bmbt_rec_t *
+xfs_bmap_do_search_extents(
+	xfs_bmbt_rec_t *,
+	xfs_extnum_t,
+	xfs_extnum_t,
+	xfs_fileoff_t,
+	int *,
+	xfs_extnum_t *,
+	xfs_bmbt_irec_t *,
+	xfs_bmbt_irec_t *);
+
+
+#endif	/* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
new file mode 100644
index 000000000000..7dcef68fb253
--- /dev/null
+++ b/fs/xfs/xfs_btree.c
@@ -0,0 +1,937 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains common code for the space manager's btree implementations.
+ */
+
+#include <xfs.h>
+
+/*
+ * Cursor allocation zone.
+ */
+kmem_zone_t	*xfs_btree_cur_zone;
+
+/*
+ * Btree magic numbers.
+ */
+const __uint32_t xfs_magics[XFS_BTNUM_MAX] =
+{
+	XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
+};
+
+/*
+ * Prototypes for internal routines.
+ */
+
+/*
+ * Checking routine: return maxrecs for the block.
+ */
+STATIC int				/* number of records fitting in block */
+xfs_btree_maxrecs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_block_t	*block);/* generic btree block pointer */
+
+/*
+ * Internal routines.
+ */
+
+/*
+ * Checking routine: return maxrecs for the block.
+ */
+STATIC int				/* number of records fitting in block */
+xfs_btree_maxrecs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_block_t	*block) /* generic btree block pointer */
+{
+	switch (cur->bc_btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		return (int)XFS_ALLOC_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
+	case XFS_BTNUM_BMAP:
+		return (int)XFS_BMAP_BLOCK_IMAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
+	case XFS_BTNUM_INO:
+		return (int)XFS_INOBT_BLOCK_MAXRECS(INT_GET(block->bb_h.bb_level, ARCH_CONVERT), cur);
+	default:
+		ASSERT(0);
+		return 0;
+	}
+}
+
+/*
+ * External routines.
+ */
+
+#ifdef DEBUG
+/*
+ * Debug routine: check that block header is ok.
+ */
+void
+xfs_btree_check_block(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_block_t	*block, /* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	xfs_buf_t		*bp)	/* buffer containing block, if any */
+{
+	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
+		xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
+			bp);
+	else
+		xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
+			bp);
+}
+
+/*
+ * Debug routine: check that keys are in the right order.
+ */
+void
+xfs_btree_check_key(
+	xfs_btnum_t	btnum,		/* btree identifier */
+	void		*ak1,		/* pointer to left (lower) key */
+	void		*ak2)		/* pointer to right (higher) key */
+{
+	switch (btnum) {
+	case XFS_BTNUM_BNO: {
+		xfs_alloc_key_t *k1;
+		xfs_alloc_key_t *k2;
+
+		k1 = ak1;
+		k2 = ak2;
+		ASSERT(INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT));
+		break;
+	    }
+	case XFS_BTNUM_CNT: {
+		xfs_alloc_key_t *k1;
+		xfs_alloc_key_t *k2;
+
+		k1 = ak1;
+		k2 = ak2;
+		ASSERT(INT_GET(k1->ar_blockcount, ARCH_CONVERT) < INT_GET(k2->ar_blockcount, ARCH_CONVERT) ||
+		       (INT_GET(k1->ar_blockcount, ARCH_CONVERT) == INT_GET(k2->ar_blockcount, ARCH_CONVERT) &&
+			INT_GET(k1->ar_startblock, ARCH_CONVERT) < INT_GET(k2->ar_startblock, ARCH_CONVERT)));
+		break;
+	    }
+	case XFS_BTNUM_BMAP: {
+		xfs_bmbt_key_t	*k1;
+		xfs_bmbt_key_t	*k2;
+
+		k1 = ak1;
+		k2 = ak2;
+		ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT));
+		break;
+	    }
+	case XFS_BTNUM_INO: {
+		xfs_inobt_key_t *k1;
+		xfs_inobt_key_t *k2;
+
+		k1 = ak1;
+		k2 = ak2;
+		ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT));
+		break;
+	    }
+	default:
+		ASSERT(0);
+	}
+}
+#endif	/* DEBUG */
+
+/*
+ * Checking routine: check that long form block header is ok.
+ */
+/* ARGSUSED */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_lblock_t	*block, /* btree long form block pointer */
+	int			level,	/* level of the btree block */
+	xfs_buf_t		*bp)	/* buffer for block, if any */
+{
+	int			lblock_ok; /* block passes checks */
+	xfs_mount_t		*mp;	/* file system mount point */
+
+	mp = cur->bc_mp;
+	lblock_ok =
+		INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
+		INT_GET(block->bb_level, ARCH_CONVERT) == level &&
+		INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
+		!INT_ISZERO(block->bb_leftsib, ARCH_CONVERT) &&
+		(INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLDFSBNO ||
+		 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_leftsib, ARCH_CONVERT))) &&
+		!INT_ISZERO(block->bb_rightsib, ARCH_CONVERT) &&
+		(INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLDFSBNO ||
+		 XFS_FSB_SANITY_CHECK(mp, INT_GET(block->bb_rightsib, ARCH_CONVERT)));
+	if (XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+			XFS_RANDOM_BTREE_CHECK_LBLOCK)) {
+		if (bp)
+			xfs_buftrace("LBTREE ERROR", bp);
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	return 0;
+}
+
+/*
+ * Checking routine: check that (long) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_dfsbno_t	ptr,		/* btree block disk address */
+	int		level)		/* btree block level */
+{
+	xfs_mount_t	*mp;		/* file system mount point */
+
+	mp = cur->bc_mp;
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		ptr != NULLDFSBNO &&
+		XFS_FSB_SANITY_CHECK(mp, ptr));
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Debug routine: check that records are in the right order.
+ */
+void
+xfs_btree_check_rec(
+	xfs_btnum_t	btnum,		/* btree identifier */
+	void		*ar1,		/* pointer to left (lower) record */
+	void		*ar2)		/* pointer to right (higher) record */
+{
+	switch (btnum) {
+	case XFS_BTNUM_BNO: {
+		xfs_alloc_rec_t *r1;
+		xfs_alloc_rec_t *r2;
+
+		r1 = ar1;
+		r2 = ar2;
+		ASSERT(INT_GET(r1->ar_startblock, ARCH_CONVERT) + INT_GET(r1->ar_blockcount, ARCH_CONVERT) <=
+		       INT_GET(r2->ar_startblock, ARCH_CONVERT));
+		break;
+	    }
+	case XFS_BTNUM_CNT: {
+		xfs_alloc_rec_t *r1;
+		xfs_alloc_rec_t *r2;
+
+		r1 = ar1;
+		r2 = ar2;
+		ASSERT(INT_GET(r1->ar_blockcount, ARCH_CONVERT) < INT_GET(r2->ar_blockcount, ARCH_CONVERT) ||
+		       (INT_GET(r1->ar_blockcount, ARCH_CONVERT) == INT_GET(r2->ar_blockcount, ARCH_CONVERT) &&
+			INT_GET(r1->ar_startblock, ARCH_CONVERT) < INT_GET(r2->ar_startblock, ARCH_CONVERT)));
+		break;
+	    }
+	case XFS_BTNUM_BMAP: {
+		xfs_bmbt_rec_t	*r1;
+		xfs_bmbt_rec_t	*r2;
+
+		r1 = ar1;
+		r2 = ar2;
+		ASSERT(xfs_bmbt_get_startoff(r1) +
+		       xfs_bmbt_get_blockcount(r1) <=
+		       xfs_bmbt_get_startoff(r2));
+		break;
+	    }
+	case XFS_BTNUM_INO: {
+		xfs_inobt_rec_t *r1;
+		xfs_inobt_rec_t *r2;
+
+		r1 = ar1;
+		r2 = ar2;
+		ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <=
+		       INT_GET(r2->ir_startino, ARCH_CONVERT));
+		break;
+	    }
+	default:
+		ASSERT(0);
+	}
+}
+#endif	/* DEBUG */
+
+/*
+ * Checking routine: check that block header is ok.
+ */
+/* ARGSUSED */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_sblock_t	*block, /* btree short form block pointer */
+	int			level,	/* level of the btree block */
+	xfs_buf_t		*bp)	/* buffer containing block */
+{
+	xfs_buf_t		*agbp;	/* buffer for ag. freespace struct */
+	xfs_agf_t		*agf;	/* ag. freespace structure */
+	xfs_agblock_t		agflen; /* native ag. freespace length */
+	int			sblock_ok; /* block passes checks */
+
+	agbp = cur->bc_private.a.agbp;
+	agf = XFS_BUF_TO_AGF(agbp);
+	agflen = INT_GET(agf->agf_length, ARCH_CONVERT);
+	sblock_ok =
+		INT_GET(block->bb_magic, ARCH_CONVERT) == xfs_magics[cur->bc_btnum] &&
+		INT_GET(block->bb_level, ARCH_CONVERT) == level &&
+		INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
+		(INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK ||
+		 INT_GET(block->bb_leftsib, ARCH_CONVERT) < agflen) &&
+		!INT_ISZERO(block->bb_leftsib, ARCH_CONVERT) &&
+		(INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK ||
+		 INT_GET(block->bb_rightsib, ARCH_CONVERT) < agflen) &&
+		!INT_ISZERO(block->bb_rightsib, ARCH_CONVERT);
+	if (XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
+			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
+			XFS_RANDOM_BTREE_CHECK_SBLOCK)) {
+		if (bp)
+			xfs_buftrace("SBTREE ERROR", bp);
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"xfs_btree_check_sblock: Not OK:");
+		cmn_err(CE_NOTE,
+			"magic 0x%x level %d numrecs %d leftsib %d rightsib %d",
+			INT_GET(block->bb_magic, ARCH_CONVERT),
+			INT_GET(block->bb_level, ARCH_CONVERT),
+			INT_GET(block->bb_numrecs, ARCH_CONVERT),
+			INT_GET(block->bb_leftsib, ARCH_CONVERT),
+			INT_GET(block->bb_rightsib, ARCH_CONVERT));
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	return 0;
+}
+
+/*
+ * Checking routine: check that (short) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agblock_t	ptr,		/* btree block disk address */
+	int		level)		/* btree block level */
+{
+	xfs_buf_t	*agbp;		/* buffer for ag. freespace struct */
+	xfs_agf_t	*agf;		/* ag. freespace structure */
+
+	agbp = cur->bc_private.a.agbp;
+	agf = XFS_BUF_TO_AGF(agbp);
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		ptr != NULLAGBLOCK && ptr != 0 &&
+		ptr < INT_GET(agf->agf_length, ARCH_CONVERT));
+	return 0;
+}
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		error)		/* del because of error */
+{
+	int		i;		/* btree level */
+
+	/*
+	 * Clear the buffer pointers, and release the buffers.
+	 * If we're doing this in the face of an error, we
+	 * need to make sure to inspect all of the entries
+	 * in the bc_bufs array for buffers to be unlocked.
+	 * This is because some of the btree code works from
+	 * level n down to 0, and if we get an error along
+	 * the way we won't have initialized all the entries
+	 * down to 0.
+	 */
+	for (i = 0; i < cur->bc_nlevels; i++) {
+		if (cur->bc_bufs[i])
+			xfs_btree_setbuf(cur, i, NULL);
+		else if (!error)
+			break;
+	}
+	/*
+	 * Can't free a bmap cursor without having dealt with the
+	 * allocated indirect blocks' accounting.
+	 */
+	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
+	       cur->bc_private.b.allocated == 0);
+	/*
+	 * Free the cursor.
+	 */
+	kmem_zone_free(xfs_btree_cur_zone, cur);
+}
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int					/* error */
+xfs_btree_dup_cursor(
+	xfs_btree_cur_t *cur,		/* input cursor */
+	xfs_btree_cur_t **ncur)		/* output cursor */
+{
+	xfs_buf_t	*bp;		/* btree block's buffer pointer */
+	int		error;		/* error return value */
+	int		i;		/* level number of btree block */
+	xfs_mount_t	*mp;		/* mount structure for filesystem */
+	xfs_btree_cur_t *new;		/* new cursor value */
+	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
+
+	tp = cur->bc_tp;
+	mp = cur->bc_mp;
+	/*
+	 * Allocate a new cursor like the old one.
+	 */
+	new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
+		cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
+		cur->bc_private.b.whichfork);
+	/*
+	 * Copy the record currently in the cursor.
+	 */
+	new->bc_rec = cur->bc_rec;
+	/*
+	 * For each level current, re-get the buffer and copy the ptr value.
+	 */
+	for (i = 0; i < new->bc_nlevels; i++) {
+		new->bc_ptrs[i] = cur->bc_ptrs[i];
+		new->bc_ra[i] = cur->bc_ra[i];
+		if ((bp = cur->bc_bufs[i])) {
+			if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+				XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
+				xfs_btree_del_cursor(new, error);
+				*ncur = NULL;
+				return error;
+			}
+			new->bc_bufs[i] = bp;
+			ASSERT(bp);
+			ASSERT(!XFS_BUF_GETERROR(bp));
+		} else
+			new->bc_bufs[i] = NULL;
+	}
+	/*
+	 * For bmap btrees, copy the firstblock, flist, and flags values,
+	 * since init cursor doesn't get them.
+	 */
+	if (new->bc_btnum == XFS_BTNUM_BMAP) {
+		new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+		new->bc_private.b.flist = cur->bc_private.b.flist;
+		new->bc_private.b.flags = cur->bc_private.b.flags;
+	}
+	*ncur = new;
+	return 0;
+}
+
+/*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+int					/* success=1, failure=0 */
+xfs_btree_firstrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to change */
+{
+	xfs_btree_block_t	*block; /* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	/*
+	 * Get the block pointer for this level.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	/*
+	 * It's empty, there is no such record.
+	 */
+	if (INT_ISZERO(block->bb_h.bb_numrecs, ARCH_CONVERT))
+		return 0;
+	/*
+	 * Set the ptr value to 1, that's the first record/key.
+	 */
+	cur->bc_ptrs[level] = 1;
+	return 1;
+}
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be a bmap btree root or from a buffer.
+ */
+xfs_btree_block_t *			/* generic btree block pointer */
+xfs_btree_get_block(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree */
+	xfs_buf_t		**bpp)	/* buffer containing the block */
+{
+	xfs_btree_block_t	*block; /* return value */
+	xfs_buf_t		*bp;	/* return buffer */
+	xfs_ifork_t		*ifp;	/* inode fork pointer */
+	int			whichfork; /* data or attr fork */
+
+	if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
+		whichfork = cur->bc_private.b.whichfork;
+		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
+		block = (xfs_btree_block_t *)ifp->if_broot;
+		bp = NULL;
+	} else {
+		bp = cur->bc_bufs[level];
+		block = XFS_BUF_TO_BLOCK(bp);
+	}
+	ASSERT(block != NULL);
+	*bpp = bp;
+	return block;
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+xfs_buf_t *				/* buffer for fsbno */
+xfs_btree_get_bufl(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_fsblock_t	fsbno,		/* file system block number */
+	uint		lock)		/* lock flags for get_buf */
+{
+	xfs_buf_t	*bp;		/* buffer pointer (return value) */
+	xfs_daddr_t		d;		/* real disk block address */
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+	ASSERT(bp);
+	ASSERT(!XFS_BUF_GETERROR(bp));
+	return bp;
+}
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+xfs_buf_t *				/* buffer for agno/agbno */
+xfs_btree_get_bufs(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* allocation group block number */
+	uint		lock)		/* lock flags for get_buf */
+{
+	xfs_buf_t	*bp;		/* buffer pointer (return value) */
+	xfs_daddr_t		d;		/* real disk block address */
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+	ASSERT(bp);
+	ASSERT(!XFS_BUF_GETERROR(bp));
+	return bp;
+}
+
+/*
+ * Allocate a new btree cursor.
+ * The cursor is either for allocation (A) or bmap (B) or inodes (I).
+ */
+xfs_btree_cur_t *			/* new btree cursor */
+xfs_btree_init_cursor(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*agbp,		/* (A only) buffer for agf structure */
+					/* (I only) buffer for agi structure */
+	xfs_agnumber_t	agno,		/* (AI only) allocation group number */
+	xfs_btnum_t	btnum,		/* btree identifier */
+	xfs_inode_t	*ip,		/* (B only) inode owning the btree */
+	int		whichfork)	/* (B only) data or attr fork */
+{
+	xfs_agf_t	*agf;		/* (A) allocation group freespace */
+	xfs_agi_t	*agi;		/* (I) allocation group inodespace */
+	xfs_btree_cur_t *cur;		/* return value */
+	xfs_ifork_t	*ifp;		/* (I) inode fork pointer */
+	int		nlevels=0;	/* number of levels in the btree */
+
+	ASSERT(xfs_btree_cur_zone != NULL);
+	/*
+	 * Allocate a new cursor.
+	 */
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+	/*
+	 * Deduce the number of btree levels from the arguments.
+	 */
+	switch (btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		agf = XFS_BUF_TO_AGF(agbp);
+		nlevels = INT_GET(agf->agf_levels[btnum], ARCH_CONVERT);
+		break;
+	case XFS_BTNUM_BMAP:
+		ifp = XFS_IFORK_PTR(ip, whichfork);
+		nlevels = INT_GET(ifp->if_broot->bb_level, ARCH_CONVERT) + 1;
+		break;
+	case XFS_BTNUM_INO:
+		agi = XFS_BUF_TO_AGI(agbp);
+		nlevels = INT_GET(agi->agi_level, ARCH_CONVERT);
+		break;
+	default:
+		ASSERT(0);
+	}
+	/*
+	 * Fill in the common fields.
+	 */
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_nlevels = nlevels;
+	cur->bc_btnum = btnum;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+	/*
+	 * Fill in private fields.
+	 */
+	switch (btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		/*
+		 * Allocation btree fields.
+		 */
+		cur->bc_private.a.agbp = agbp;
+		cur->bc_private.a.agno = agno;
+		break;
+	case XFS_BTNUM_BMAP:
+		/*
+		 * Bmap btree fields.
+		 */
+		cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+		cur->bc_private.b.ip = ip;
+		cur->bc_private.b.firstblock = NULLFSBLOCK;
+		cur->bc_private.b.flist = NULL;
+		cur->bc_private.b.allocated = 0;
+		cur->bc_private.b.flags = 0;
+		cur->bc_private.b.whichfork = whichfork;
+		break;
+	case XFS_BTNUM_INO:
+		/*
+		 * Inode allocation btree fields.
+		 */
+		cur->bc_private.i.agbp = agbp;
+		cur->bc_private.i.agno = agno;
+		break;
+	default:
+		ASSERT(0);
+	}
+	return cur;
+}
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int					/* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to check */
+{
+	xfs_btree_block_t	*block; /* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
+		return INT_GET(block->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO;
+	else
+		return INT_GET(block->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK;
+}
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level.	Other levels are unaffected.
+ */
+int					/* success=1, failure=0 */
+xfs_btree_lastrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level)	/* level to change */
+{
+	xfs_btree_block_t	*block; /* generic btree block pointer */
+	xfs_buf_t		*bp;	/* buffer containing block */
+
+	/*
+	 * Get the block pointer for this level.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	xfs_btree_check_block(cur, block, level, bp);
+	/*
+	 * It's empty, there is no such record.
+	 */
+	if (INT_ISZERO(block->bb_h.bb_numrecs, ARCH_CONVERT))
+		return 0;
+	/*
+	 * Set the ptr value to numrecs, that's the last record/key.
+	 */
+	cur->bc_ptrs[level] = INT_GET(block->bb_h.bb_numrecs, ARCH_CONVERT);
+	return 1;
+}
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+	__int64_t	fields,		/* bitmask of fields */
+	const short	*offsets,	/* table of field offsets */
+	int		nbits,		/* number of bits to inspect */
+	int		*first,		/* output: first byte offset */
+	int		*last)		/* output: last byte offset */
+{
+	int		i;		/* current bit number */
+	__int64_t	imask;		/* mask for current bit number */
+
+	ASSERT(fields != 0);
+	/*
+	 * Find the lowest bit, so the first byte offset.
+	 */
+	for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+		if (imask & fields) {
+			*first = offsets[i];
+			break;
+		}
+	}
+	/*
+	 * Find the highest bit, so the last byte offset.
+	 */
+	for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+		if (imask & fields) {
+			*last = offsets[i + 1] - 1;
+			break;
+		}
+	}
+}
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int					/* error */
+xfs_btree_read_bufl(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_fsblock_t	fsbno,		/* file system block number */
+	uint		lock,		/* lock flags for read_buf */
+	xfs_buf_t	**bpp,		/* buffer for fsbno */
+	int		refval)		/* ref count value for buffer */
+{
+	xfs_buf_t	*bp;		/* return value */
+	xfs_daddr_t		d;		/* real disk block address */
+	int		error;
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+			mp->m_bsize, lock, &bp))) {
+		return error;
+	}
+	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
+	if (bp != NULL) {
+		XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
+	}
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Short-form addressing.
+ */
+int					/* error */
+xfs_btree_read_bufs(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* allocation group block number */
+	uint		lock,		/* lock flags for read_buf */
+	xfs_buf_t	**bpp,		/* buffer for agno/agbno */
+	int		refval)		/* ref count value for buffer */
+{
+	xfs_buf_t	*bp;		/* return value */
+	xfs_daddr_t	d;		/* real disk block address */
+	int		error;
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+					mp->m_bsize, lock, &bp))) {
+		return error;
+	}
+	ASSERT(!bp || !XFS_BUF_GETERROR(bp));
+	if (bp != NULL) {
+		switch (refval) {
+		case XFS_ALLOC_BTREE_REF:
+			XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
+			break;
+		case XFS_INO_BTREE_REF:
+			XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
+			break;
+		}
+	}
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufl(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_fsblock_t	fsbno,		/* file system block number */
+	xfs_extlen_t	count)		/* count of filesystem blocks */
+{
+	xfs_daddr_t		d;
+
+	ASSERT(fsbno != NULLFSBLOCK);
+	d = XFS_FSB_TO_DADDR(mp, fsbno);
+	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+}
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+/* ARGSUSED */
+void
+xfs_btree_reada_bufs(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_agblock_t	agbno,		/* allocation group block number */
+	xfs_extlen_t	count)		/* count of filesystem blocks */
+{
+	xfs_daddr_t		d;
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agbno != NULLAGBLOCK);
+	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+int
+xfs_btree_readahead_core(
+	xfs_btree_cur_t		*cur,		/* btree cursor */
+	int			lev,		/* level in btree */
+	int			lr)		/* left/right bits */
+{
+	xfs_alloc_block_t	*a;
+	xfs_bmbt_block_t	*b;
+	xfs_inobt_block_t	*i;
+	int			rval = 0;
+
+	ASSERT(cur->bc_bufs[lev] != NULL);
+	cur->bc_ra[lev] |= lr;
+	switch (cur->bc_btnum) {
+	case XFS_BTNUM_BNO:
+	case XFS_BTNUM_CNT:
+		a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
+		if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(a->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				INT_GET(a->bb_leftsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(a->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				INT_GET(a->bb_rightsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		break;
+	case XFS_BTNUM_BMAP:
+		b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
+		if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(b->bb_leftsib, ARCH_CONVERT) != NULLDFSBNO) {
+			xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_leftsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(b->bb_rightsib, ARCH_CONVERT) != NULLDFSBNO) {
+			xfs_btree_reada_bufl(cur->bc_mp, INT_GET(b->bb_rightsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		break;
+	case XFS_BTNUM_INO:
+		i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
+		if ((lr & XFS_BTCUR_LEFTRA) && INT_GET(i->bb_leftsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+				INT_GET(i->bb_leftsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		if ((lr & XFS_BTCUR_RIGHTRA) && INT_GET(i->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+				INT_GET(i->bb_rightsib, ARCH_CONVERT), 1);
+			rval++;
+		}
+		break;
+	default:
+		ASSERT(0);
+	}
+	return rval;
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+void
+xfs_btree_setbuf(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	xfs_buf_t		*bp)	/* new buffer to set */
+{
+	xfs_btree_block_t	*b;	/* btree block */
+	xfs_buf_t		*obp;	/* old buffer pointer */
+
+	obp = cur->bc_bufs[lev];
+	if (obp)
+		xfs_trans_brelse(cur->bc_tp, obp);
+	cur->bc_bufs[lev] = bp;
+	cur->bc_ra[lev] = 0;
+	if (!bp)
+		return;
+	b = XFS_BUF_TO_BLOCK(bp);
+	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
+		if (INT_GET(b->bb_u.l.bb_leftsib, ARCH_CONVERT) == NULLDFSBNO)
+			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+		if (INT_GET(b->bb_u.l.bb_rightsib, ARCH_CONVERT) == NULLDFSBNO)
+			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+	} else {
+		if (INT_GET(b->bb_u.s.bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK)
+			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
+		if (INT_GET(b->bb_u.s.bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK)
+			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
+	}
+}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
new file mode 100644
index 000000000000..7a6857ebaf3a
--- /dev/null
+++ b/fs/xfs/xfs_btree.h
@@ -0,0 +1,587 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BTREE_H__
+#define __XFS_BTREE_H__
+
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * This nonsense is to make -wlint happy.
+ */
+#define XFS_LOOKUP_EQ	((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define XFS_LOOKUP_LE	((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define XFS_LOOKUP_GE	((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define XFS_BTNUM_BNO	((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define XFS_BTNUM_CNT	((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define XFS_BTNUM_BMAP	((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define XFS_BTNUM_INO	((xfs_btnum_t)XFS_BTNUM_INOi)
+
+/*
+ * Short form header: space allocation btrees.
+ */
+typedef struct xfs_btree_sblock
+{
+	__uint32_t	bb_magic;	/* magic number for block type */
+	__uint16_t	bb_level;	/* 0 is a leaf */
+	__uint16_t	bb_numrecs;	/* current # of data records */
+	xfs_agblock_t	bb_leftsib;	/* left sibling block or NULLAGBLOCK */
+	xfs_agblock_t	bb_rightsib;	/* right sibling block or NULLAGBLOCK */
+} xfs_btree_sblock_t;
+
+/*
+ * Long form header: bmap btrees.
+ */
+typedef struct xfs_btree_lblock
+{
+	__uint32_t	bb_magic;	/* magic number for block type */
+	__uint16_t	bb_level;	/* 0 is a leaf */
+	__uint16_t	bb_numrecs;	/* current # of data records */
+	xfs_dfsbno_t	bb_leftsib;	/* left sibling block or NULLDFSBNO */
+	xfs_dfsbno_t	bb_rightsib;	/* right sibling block or NULLDFSBNO */
+} xfs_btree_lblock_t;
+
+/*
+ * Combined header and structure, used by common code.
+ */
+typedef struct xfs_btree_hdr
+{
+	__uint32_t	bb_magic;	/* magic number for block type */
+	__uint16_t	bb_level;	/* 0 is a leaf */
+	__uint16_t	bb_numrecs;	/* current # of data records */
+} xfs_btree_hdr_t;
+
+typedef struct xfs_btree_block
+{
+	xfs_btree_hdr_t bb_h;		/* header */
+	union		{
+		struct	{
+			xfs_agblock_t	bb_leftsib;
+			xfs_agblock_t	bb_rightsib;
+		}	s;		/* short form pointers */
+		struct	{
+			xfs_dfsbno_t	bb_leftsib;
+			xfs_dfsbno_t	bb_rightsib;
+		}	l;		/* long form pointers */
+	}		bb_u;		/* rest */
+} xfs_btree_block_t;
+
+/*
+ * For logging record fields.
+ */
+#define XFS_BB_MAGIC		0x01
+#define XFS_BB_LEVEL		0x02
+#define XFS_BB_NUMRECS		0x04
+#define XFS_BB_LEFTSIB		0x08
+#define XFS_BB_RIGHTSIB		0x10
+#define XFS_BB_NUM_BITS		5
+#define XFS_BB_ALL_BITS		((1 << XFS_BB_NUM_BITS) - 1)
+
+/*
+ * Boolean to select which form of xfs_btree_block_t.bb_u to use.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BTREE_LONG_PTRS)
+int xfs_btree_long_ptrs(xfs_btnum_t btnum);
+#define XFS_BTREE_LONG_PTRS(btnum)	((btnum) == XFS_BTNUM_BMAP)
+#else
+#define XFS_BTREE_LONG_PTRS(btnum)	((btnum) == XFS_BTNUM_BMAP)
+#endif
+
+/*
+ * Magic numbers for btree blocks.
+ */
+extern const __uint32_t xfs_magics[];
+
+/*
+ * Maximum and minimum records in a btree block.
+ * Given block size, type prefix, and leaf flag (0 or 1).
+ * The divisor below is equivalent to lf ? (e1) : (e2) but that produces
+ * compiler warnings.
+ */
+#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf)	\
+	((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \
+	 (((lf) * (uint)sizeof(t ## _rec_t)) + \
+	  ((1 - (lf)) * \
+	   ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t))))))
+#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf)	\
+	(XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2)
+
+/*
+ * Record, key, and pointer address calculation macros.
+ * Given block size, type prefix, block pointer, and index of requested entry
+ * (first entry numbered 1).
+ */
+#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr)	\
+	((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \
+	 ((i) - 1) * sizeof(t ## _rec_t)))
+#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr)	\
+	((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \
+	 ((i) - 1) * sizeof(t ## _key_t)))
+#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr)	\
+	((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \
+	 (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t)))
+
+#define XFS_BTREE_MAXLEVELS	8	/* max of all btrees */
+
+/*
+ * Btree cursor structure.
+ * This collects all information needed by the btree code in one place.
+ */
+typedef struct xfs_btree_cur
+{
+	struct xfs_trans	*bc_tp; /* transaction we're in, if any */
+	struct xfs_mount	*bc_mp; /* file system mount struct */
+	union {
+		xfs_alloc_rec_t		a;
+		xfs_bmbt_irec_t		b;
+		xfs_inobt_rec_t		i;
+	}		bc_rec;		/* current insert/search record value */
+	struct xfs_buf	*bc_bufs[XFS_BTREE_MAXLEVELS];	/* buf ptr per level */
+	int		bc_ptrs[XFS_BTREE_MAXLEVELS];	/* key/record # */
+	__uint8_t	bc_ra[XFS_BTREE_MAXLEVELS];	/* readahead bits */
+#define XFS_BTCUR_LEFTRA	1	/* left sibling has been read-ahead */
+#define XFS_BTCUR_RIGHTRA	2	/* right sibling has been read-ahead */
+	__uint8_t	bc_nlevels;	/* number of levels in the tree */
+	__uint8_t	bc_blocklog;	/* log2(blocksize) of btree blocks */
+	xfs_btnum_t	bc_btnum;	/* identifies which btree type */
+	union {
+		struct {			/* needed for BNO, CNT */
+			struct xfs_buf	*agbp;	/* agf buffer pointer */
+			xfs_agnumber_t	agno;	/* ag number */
+		} a;
+		struct {			/* needed for BMAP */
+			struct xfs_inode *ip;	/* pointer to our inode */
+			struct xfs_bmap_free *flist;	/* list to free after */
+			xfs_fsblock_t	firstblock;	/* 1st blk allocated */
+			int		allocated;	/* count of alloced */
+			short		forksize;	/* fork's inode space */
+			char		whichfork;	/* data or attr fork */
+			char		flags;		/* flags */
+#define XFS_BTCUR_BPRV_WASDEL	1			/* was delayed */
+		} b;
+		struct {			/* needed for INO */
+			struct xfs_buf	*agbp;	/* agi buffer pointer */
+			xfs_agnumber_t	agno;	/* ag number */
+		} i;
+	}		bc_private;	/* per-btree type data */
+} xfs_btree_cur_t;
+
+#define XFS_BTREE_NOERROR	0
+#define XFS_BTREE_ERROR		1
+
+/*
+ * Convert from buffer to btree block header.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_BLOCK)
+xfs_btree_block_t *xfs_buf_to_block(struct xfs_buf *bp);
+#define XFS_BUF_TO_BLOCK(bp)	xfs_buf_to_block(bp)
+#else
+#define XFS_BUF_TO_BLOCK(bp)	((xfs_btree_block_t *)(XFS_BUF_PTR(bp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_LBLOCK)
+xfs_btree_lblock_t *xfs_buf_to_lblock(struct xfs_buf *bp);
+#define XFS_BUF_TO_LBLOCK(bp)	xfs_buf_to_lblock(bp)
+#else
+#define XFS_BUF_TO_LBLOCK(bp)	((xfs_btree_lblock_t *)(XFS_BUF_PTR(bp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBLOCK)
+xfs_btree_sblock_t *xfs_buf_to_sblock(struct xfs_buf *bp);
+#define XFS_BUF_TO_SBLOCK(bp)	xfs_buf_to_sblock(bp)
+#else
+#define XFS_BUF_TO_SBLOCK(bp)	((xfs_btree_sblock_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+#ifdef __KERNEL__
+
+#ifdef DEBUG
+/*
+ * Debug routine: check that block header is ok.
+ */
+void
+xfs_btree_check_block(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_block_t	*block, /* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp);	/* buffer containing block, if any */
+
+/*
+ * Debug routine: check that keys are in the right order.
+ */
+void
+xfs_btree_check_key(
+	xfs_btnum_t		btnum,	/* btree identifier */
+	void			*ak1,	/* pointer to left (lower) key */
+	void			*ak2);	/* pointer to right (higher) key */
+
+/*
+ * Debug routine: check that records are in the right order.
+ */
+void
+xfs_btree_check_rec(
+	xfs_btnum_t		btnum,	/* btree identifier */
+	void			*ar1,	/* pointer to left (lower) record */
+	void			*ar2);	/* pointer to right (higher) record */
+#else
+#define xfs_btree_check_block(a,b,c,d)
+#define xfs_btree_check_key(a,b,c)
+#define xfs_btree_check_rec(a,b,c)
+#endif	/* DEBUG */
+
+/*
+ * Checking routine: check that long form block header is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_lblock_t	*block, /* btree long form block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp);	/* buffer containing block, if any */
+
+/*
+ * Checking routine: check that (long) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_dfsbno_t		ptr,	/* btree block disk address */
+	int			level); /* btree block level */
+
+/*
+ * Checking routine: check that short form block header is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_btree_sblock_t	*block, /* btree short form block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp);	/* buffer containing block */
+
+/*
+ * Checking routine: check that (short) pointer is ok.
+ */
+int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_sptr(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agblock_t		ptr,	/* btree block disk address */
+	int			level); /* btree block level */
+
+/*
+ * Delete the btree cursor.
+ */
+void
+xfs_btree_del_cursor(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			error); /* del because of error */
+
+/*
+ * Duplicate the btree cursor.
+ * Allocate a new one, copy the record, re-get the buffers.
+ */
+int					/* error */
+xfs_btree_dup_cursor(
+	xfs_btree_cur_t		*cur,	/* input cursor */
+	xfs_btree_cur_t		**ncur);/* output cursor */
+
+/*
+ * Change the cursor to point to the first record in the current block
+ * at the given level.	Other levels are unaffected.
+ */
+int					/* success=1, failure=0 */
+xfs_btree_firstrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level); /* level to change */
+
+/*
+ * Retrieve the block pointer from the cursor at the given level.
+ * This may be a bmap btree root or from a buffer.
+ */
+xfs_btree_block_t *			/* generic btree block pointer */
+xfs_btree_get_block(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree */
+	struct xfs_buf		**bpp); /* buffer containing the block */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Long-form addressing.
+ */
+struct xfs_buf *				/* buffer for fsbno */
+xfs_btree_get_bufl(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_fsblock_t		fsbno,	/* file system block number */
+	uint			lock);	/* lock flags for get_buf */
+
+/*
+ * Get a buffer for the block, return it with no data read.
+ * Short-form addressing.
+ */
+struct xfs_buf *				/* buffer for agno/agbno */
+xfs_btree_get_bufs(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	xfs_agblock_t		agbno,	/* allocation group block number */
+	uint			lock);	/* lock flags for get_buf */
+
+/*
+ * Allocate a new btree cursor.
+ * The cursor is either for allocation (A) or bmap (B).
+ */
+xfs_btree_cur_t *			/* new btree cursor */
+xfs_btree_init_cursor(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	struct xfs_buf		*agbp,	/* (A only) buffer for agf structure */
+	xfs_agnumber_t		agno,	/* (A only) allocation group number */
+	xfs_btnum_t		btnum,	/* btree identifier */
+	struct xfs_inode	*ip,	/* (B only) inode owning the btree */
+	int			whichfork); /* (B only) data/attr fork */
+
+/*
+ * Check for the cursor referring to the last block at the given level.
+ */
+int					/* 1=is last block, 0=not last block */
+xfs_btree_islastblock(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level); /* level to check */
+
+/*
+ * Change the cursor to point to the last record in the current block
+ * at the given level.	Other levels are unaffected.
+ */
+int					/* success=1, failure=0 */
+xfs_btree_lastrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level); /* level to change */
+
+/*
+ * Compute first and last byte offsets for the fields given.
+ * Interprets the offsets table, which contains struct field offsets.
+ */
+void
+xfs_btree_offsets(
+	__int64_t		fields, /* bitmask of fields */
+	const short		*offsets,/* table of field offsets */
+	int			nbits,	/* number of bits to inspect */
+	int			*first, /* output: first byte offset */
+	int			*last); /* output: last byte offset */
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Long-form addressing.
+ */
+int					/* error */
+xfs_btree_read_bufl(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_fsblock_t		fsbno,	/* file system block number */
+	uint			lock,	/* lock flags for read_buf */
+	struct xfs_buf		**bpp,	/* buffer for fsbno */
+	int			refval);/* ref count value for buffer */
+
+/*
+ * Get a buffer for the block, return it read in.
+ * Short-form addressing.
+ */
+int					/* error */
+xfs_btree_read_bufs(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	xfs_agblock_t		agbno,	/* allocation group block number */
+	uint			lock,	/* lock flags for read_buf */
+	struct xfs_buf		**bpp,	/* buffer for agno/agbno */
+	int			refval);/* ref count value for buffer */
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Long-form addressing.
+ */
+void					/* error */
+xfs_btree_reada_bufl(
+	struct xfs_mount	*mp,	/* file system mount point */
+	xfs_fsblock_t		fsbno,	/* file system block number */
+	xfs_extlen_t		count); /* count of filesystem blocks */
+
+/*
+ * Read-ahead the block, don't wait for it, don't return a buffer.
+ * Short-form addressing.
+ */
+void					/* error */
+xfs_btree_reada_bufs(
+	struct xfs_mount	*mp,	/* file system mount point */
+	xfs_agnumber_t		agno,	/* allocation group number */
+	xfs_agblock_t		agbno,	/* allocation group block number */
+	xfs_extlen_t		count); /* count of filesystem blocks */
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+int					/* readahead block count */
+xfs_btree_readahead_core(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	int			lr);	/* left/right bits */
+
+static inline int			/* readahead block count */
+xfs_btree_readahead(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	int			lr)	/* left/right bits */
+{
+	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+		return 0;
+
+	return xfs_btree_readahead_core(cur, lev, lr);
+}
+
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+void
+xfs_btree_setbuf(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			lev,	/* level in btree */
+	struct xfs_buf		*bp);	/* new buffer to set */
+
+#endif	/* __KERNEL__ */
+
+
+/*
+ * Min and max functions for extlen, agblock, fileoff, and filblks types.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MIN)
+xfs_extlen_t xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b);
+#define XFS_EXTLEN_MIN(a,b)	xfs_extlen_min(a,b)
+#else
+#define XFS_EXTLEN_MIN(a,b)	\
+	((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \
+	 (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_EXTLEN_MAX)
+xfs_extlen_t xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b);
+#define XFS_EXTLEN_MAX(a,b)	xfs_extlen_max(a,b)
+#else
+#define XFS_EXTLEN_MAX(a,b)	\
+	((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \
+	 (xfs_extlen_t)(a) : (xfs_extlen_t)(b))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MIN)
+xfs_agblock_t xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b);
+#define XFS_AGBLOCK_MIN(a,b)	xfs_agblock_min(a,b)
+#else
+#define XFS_AGBLOCK_MIN(a,b)	\
+	((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \
+	 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGBLOCK_MAX)
+xfs_agblock_t xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b);
+#define XFS_AGBLOCK_MAX(a,b)	xfs_agblock_max(a,b)
+#else
+#define XFS_AGBLOCK_MAX(a,b)	\
+	((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \
+	 (xfs_agblock_t)(a) : (xfs_agblock_t)(b))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MIN)
+xfs_fileoff_t xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b);
+#define XFS_FILEOFF_MIN(a,b)	xfs_fileoff_min(a,b)
+#else
+#define XFS_FILEOFF_MIN(a,b)	\
+	((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \
+	 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILEOFF_MAX)
+xfs_fileoff_t xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b);
+#define XFS_FILEOFF_MAX(a,b)	xfs_fileoff_max(a,b)
+#else
+#define XFS_FILEOFF_MAX(a,b)	\
+	((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \
+	 (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MIN)
+xfs_filblks_t xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b);
+#define XFS_FILBLKS_MIN(a,b)	xfs_filblks_min(a,b)
+#else
+#define XFS_FILBLKS_MIN(a,b)	\
+	((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \
+	 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FILBLKS_MAX)
+xfs_filblks_t xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b);
+#define XFS_FILBLKS_MAX(a,b)	xfs_filblks_max(a,b)
+#else
+#define XFS_FILBLKS_MAX(a,b)	\
+	((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \
+	 (xfs_filblks_t)(a) : (xfs_filblks_t)(b))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_SANITY_CHECK)
+int xfs_fsb_sanity_check(struct xfs_mount *mp, xfs_fsblock_t fsb);
+#define XFS_FSB_SANITY_CHECK(mp,fsb)	xfs_fsb_sanity_check(mp,fsb)
+#else
+#define XFS_FSB_SANITY_CHECK(mp,fsb)	\
+	(XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
+	 XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
+#endif
+
+/*
+ * Macros to set EFSCORRUPTED & return/branch.
+ */
+#define XFS_WANT_CORRUPTED_GOTO(x,l)	\
+	{ \
+		int fs_is_ok = (x); \
+		ASSERT(fs_is_ok); \
+		if (!fs_is_ok) { \
+			error = XFS_ERROR(EFSCORRUPTED); \
+			goto l; \
+		} \
+	}
+
+#define XFS_WANT_CORRUPTED_RETURN(x)	\
+	{ \
+		int fs_is_ok = (x); \
+		ASSERT(fs_is_ok); \
+		if (!fs_is_ok) \
+			return XFS_ERROR(EFSCORRUPTED); \
+	}
+
+#endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
new file mode 100644
index 000000000000..f557a2c28419
--- /dev/null
+++ b/fs/xfs/xfs_buf.h
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BUF_H__
+#define __XFS_BUF_H__
+
+/* These are just for xfs_syncsub... it sets an internal variable
+ * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t
+ */
+#define XFS_B_ASYNC  PBF_ASYNC
+#define XFS_B_DELWRI PBF_DELWRI
+#define XFS_B_READ   PBF_READ
+#define XFS_B_WRITE  PBF_WRITE
+#define XFS_B_STALE  PBF_STALE
+#define XFS_BUF_TRYLOCK		PBF_TRYLOCK
+#define XFS_INCORE_TRYLOCK	PBF_TRYLOCK
+#define XFS_BUF_LOCK		PBF_LOCK
+#define XFS_BUF_MAPPED		PBF_MAPPED
+
+#define BUF_BUSY	PBF_DONT_BLOCK
+
+#define XFS_BUF_BFLAGS(x)	 ((x)->pb_flags)  /* debugging routines might need this */
+#define XFS_BUF_ZEROFLAGS(x)	\
+	((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_SYNC|PBF_DELWRI))
+
+#define XFS_BUF_STALE(x)	     ((x)->pb_flags |= XFS_B_STALE)
+#define XFS_BUF_UNSTALE(x)	     ((x)->pb_flags &= ~XFS_B_STALE)
+#define XFS_BUF_ISSTALE(x)	     ((x)->pb_flags & XFS_B_STALE)
+#define XFS_BUF_SUPER_STALE(x)	 (x)->pb_flags |= XFS_B_STALE;\
+				 xfs_buf_undelay(x);\
+				 (x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE)
+
+static inline void xfs_buf_undelay(page_buf_t *pb)
+{
+	if (pb->pb_flags & PBF_DELWRI) {
+		if (pb->pb_list.next != &pb->pb_list) {
+			pagebuf_delwri_dequeue(pb);
+			pagebuf_rele(pb);
+		} else {
+			pb->pb_flags &= ~PBF_DELWRI;
+		}
+	}
+}
+
+#define XFS_BUF_DELAYWRITE(x)	 ((x)->pb_flags |= PBF_DELWRI)
+#define XFS_BUF_UNDELAYWRITE(x)	 xfs_buf_undelay(x)
+#define XFS_BUF_ISDELAYWRITE(x)	 ((x)->pb_flags & PBF_DELWRI)
+
+#define XFS_BUF_ERROR(x,no)	 pagebuf_ioerror(x,no)
+#define XFS_BUF_GETERROR(x)	 pagebuf_geterror(x)
+#define XFS_BUF_ISERROR(x)	 (pagebuf_geterror(x)?1:0)
+
+#define XFS_BUF_DONE(x)		 ((x)->pb_flags &= ~(PBF_PARTIAL|PBF_NONE))
+#define XFS_BUF_UNDONE(x)	 ((x)->pb_flags |= PBF_PARTIAL|PBF_NONE)
+#define XFS_BUF_ISDONE(x)	 (!(PBF_NOT_DONE(x)))
+
+#define XFS_BUF_BUSY(x)		 ((x)->pb_flags |= PBF_FORCEIO)
+#define XFS_BUF_UNBUSY(x)	 ((x)->pb_flags &= ~PBF_FORCEIO)
+#define XFS_BUF_ISBUSY(x)	 (1)
+
+#define XFS_BUF_ASYNC(x)	 ((x)->pb_flags |= PBF_ASYNC)
+#define XFS_BUF_UNASYNC(x)	 ((x)->pb_flags &= ~PBF_ASYNC)
+#define XFS_BUF_ISASYNC(x)	 ((x)->pb_flags & PBF_ASYNC)
+
+#define XFS_BUF_FLUSH(x)	 ((x)->pb_flags |= PBF_FLUSH)
+#define XFS_BUF_UNFLUSH(x)	 ((x)->pb_flags &= ~PBF_FLUSH)
+#define XFS_BUF_ISFLUSH(x)	 ((x)->pb_flags & PBF_FLUSH)
+
+#define XFS_BUF_SHUT(x)		 printk("XFS_BUF_SHUT not implemented yet\n")
+#define XFS_BUF_UNSHUT(x)	 printk("XFS_BUF_UNSHUT not implemented yet\n")
+#define XFS_BUF_ISSHUT(x)	 (0)
+
+#define XFS_BUF_HOLD(x)		pagebuf_hold(x)
+#define XFS_BUF_READ(x)		((x)->pb_flags |= PBF_READ)
+#define XFS_BUF_UNREAD(x)	((x)->pb_flags &= ~PBF_READ)
+#define XFS_BUF_ISREAD(x)	((x)->pb_flags & PBF_READ)
+
+#define XFS_BUF_WRITE(x)	((x)->pb_flags |= PBF_WRITE)
+#define XFS_BUF_UNWRITE(x)	((x)->pb_flags &= ~PBF_WRITE)
+#define XFS_BUF_ISWRITE(x)	((x)->pb_flags & PBF_WRITE)
+
+#define XFS_BUF_ISUNINITIAL(x)	 ((x)->pb_flags & PBF_UNINITIAL)
+#define XFS_BUF_UNUNINITIAL(x)	 ((x)->pb_flags &= ~PBF_UNINITIAL)
+
+#define XFS_BUF_BP_ISMAPPED(bp)	 1
+
+typedef struct page_buf_s xfs_buf_t;
+#define xfs_buf page_buf_s
+
+typedef struct pb_target xfs_buftarg_t;
+#define xfs_buftarg pb_target
+
+#define XFS_BUF_IODONE_FUNC(buf)	(buf)->pb_iodone
+#define XFS_BUF_SET_IODONE_FUNC(buf, func)	\
+			(buf)->pb_iodone = (func)
+#define XFS_BUF_CLR_IODONE_FUNC(buf)		\
+			(buf)->pb_iodone = NULL
+#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func)	\
+			(buf)->pb_strat = (func)
+#define XFS_BUF_CLR_BDSTRAT_FUNC(buf)		\
+			(buf)->pb_strat = NULL
+
+#define XFS_BUF_FSPRIVATE(buf, type)		\
+			((type)(buf)->pb_fspriv)
+#define XFS_BUF_SET_FSPRIVATE(buf, value)	\
+			(buf)->pb_fspriv = (void *)(value)
+#define XFS_BUF_FSPRIVATE2(buf, type)		\
+			((type)(buf)->pb_fspriv2)
+#define XFS_BUF_SET_FSPRIVATE2(buf, value)	\
+			(buf)->pb_fspriv2 = (void *)(value)
+#define XFS_BUF_FSPRIVATE3(buf, type)		\
+			((type)(buf)->pb_fspriv3)
+#define XFS_BUF_SET_FSPRIVATE3(buf, value)	\
+			(buf)->pb_fspriv3  = (void *)(value)
+#define XFS_BUF_SET_START(buf)
+
+#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \
+			(buf)->pb_relse = (value)
+
+#define XFS_BUF_PTR(bp)		(xfs_caddr_t)((bp)->pb_addr)
+
+extern inline xfs_caddr_t xfs_buf_offset(page_buf_t *bp, off_t offset)
+{
+	if (bp->pb_flags & PBF_MAPPED)
+		return XFS_BUF_PTR(bp) + offset;
+	return (xfs_caddr_t) pagebuf_offset(bp, offset);
+}
+
+#define XFS_BUF_SET_PTR(bp, val, count)		\
+				pagebuf_associate_memory(bp, val, count)
+#define XFS_BUF_ADDR(bp)	((bp)->pb_bn)
+#define XFS_BUF_OFFSET(bp)	((bp)->pb_file_offset >> 9)
+#define XFS_BUF_SET_ADDR(bp, blk)		\
+			((bp)->pb_bn = (page_buf_daddr_t)(blk))
+#define XFS_BUF_COUNT(bp)	((bp)->pb_count_desired)
+#define XFS_BUF_SET_COUNT(bp, cnt)		\
+			((bp)->pb_count_desired = cnt)
+#define XFS_BUF_SIZE(bp)	((bp)->pb_buffer_length)
+#define XFS_BUF_SET_SIZE(bp, cnt)		\
+			((bp)->pb_buffer_length = cnt)
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
+#define XFS_BUF_SET_VTYPE(bp, type)
+#define XFS_BUF_SET_REF(bp, ref)
+
+#define XFS_BUF_ISPINNED(bp)   pagebuf_ispin(bp)
+
+#define XFS_BUF_VALUSEMA(bp)	pagebuf_lock_value(bp)
+#define XFS_BUF_CPSEMA(bp)	(pagebuf_cond_lock(bp) == 0)
+#define XFS_BUF_VSEMA(bp)	pagebuf_unlock(bp)
+#define XFS_BUF_PSEMA(bp,x)	pagebuf_lock(bp)
+#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema);
+
+/* setup the buffer target from a buftarg structure */
+#define XFS_BUF_SET_TARGET(bp, target)	\
+	(bp)->pb_target = (target)
+
+#define XFS_BUF_TARGET_DEV(bp)	((bp)->pb_target->pbr_dev)
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)
+#define XFS_BUF_SET_VTYPE(bp, type)
+#define XFS_BUF_SET_REF(bp, ref)
+
+#define xfs_buf_read(target, blkno, len, flags) \
+		pagebuf_get((target), (blkno), (len), \
+			PBF_LOCK | PBF_READ | PBF_MAPPED | PBF_MAPPABLE)
+#define xfs_buf_get(target, blkno, len, flags) \
+		pagebuf_get((target), (blkno), (len), \
+			PBF_LOCK | PBF_MAPPED | PBF_MAPPABLE)
+
+#define xfs_buf_read_flags(target, blkno, len, flags) \
+		pagebuf_get((target), (blkno), (len), \
+			PBF_READ | PBF_MAPPABLE | flags)
+#define xfs_buf_get_flags(target, blkno, len, flags) \
+		pagebuf_get((target), (blkno), (len), \
+			PBF_MAPPABLE | flags)
+
+static inline int	xfs_bawrite(void *mp, page_buf_t *bp)
+{
+	extern int	xfs_bdstrat_cb(struct xfs_buf *);
+	int ret;
+
+	bp->pb_fspriv3 = mp;
+	bp->pb_strat = xfs_bdstrat_cb;
+	xfs_buf_undelay(bp);
+	if ((ret = pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC)) == 0)
+		blk_run_queues();
+	return ret;
+}
+
+static inline void	xfs_buf_relse(page_buf_t *bp)
+{
+	if ((bp->pb_flags & _PBF_LOCKABLE) && !bp->pb_relse)
+		pagebuf_unlock(bp);
+
+	pagebuf_rele(bp);
+}
+
+
+#define xfs_bpin(bp)		pagebuf_pin(bp)
+#define xfs_bunpin(bp)		pagebuf_unpin(bp)
+
+#ifdef PAGEBUF_TRACE
+# define PB_DEFINE_TRACES
+# include <pagebuf/page_buf_trace.h>
+# define xfs_buftrace(id, bp)	PB_TRACE(bp, PB_TRACE_REC(external), (void *)id)
+#else
+# define xfs_buftrace(id, bp)	do { } while (0)
+#endif
+
+
+#define xfs_biodone(pb)		    \
+	    pagebuf_iodone(pb)
+
+#define xfs_incore(buftarg,blkno,len,lockit) \
+	    pagebuf_find(buftarg, blkno ,len, lockit)
+
+
+#define xfs_biomove(pb, off, len, data, rw) \
+	    pagebuf_iomove((pb), (off), (len), (data), \
+		((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ)
+
+#define xfs_biozero(pb, off, len) \
+	    pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO)
+
+
+static inline int	XFS_bwrite(page_buf_t *pb)
+{
+	int	sync = (pb->pb_flags & PBF_ASYNC) == 0;
+	int	error;
+
+	pb->pb_flags |= PBF_SYNC;
+
+	xfs_buf_undelay(pb);
+
+	__pagebuf_iorequest(pb);
+
+	if (sync) {
+		error = pagebuf_iowait(pb);
+		xfs_buf_relse(pb);
+	} else {
+		blk_run_queues();
+		error = 0;
+	}
+
+	return error;
+}
+
+
+#define XFS_bdwrite(pb)		     \
+	    pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC)
+
+static inline int xfs_bdwrite(void *mp, page_buf_t *bp)
+{
+	extern int	xfs_bdstrat_cb(struct xfs_buf *);
+
+	bp->pb_strat = xfs_bdstrat_cb;
+	bp->pb_fspriv3 = mp;
+
+	return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
+}
+
+#define XFS_bdstrat(bp) pagebuf_iorequest(bp)
+
+#define xfs_iowait(pb)	pagebuf_iowait(pb)
+
+
+/*
+ * Go through all incore buffers, and release buffers
+ * if they belong to the given device. This is used in
+ * filesystem error handling to preserve the consistency
+ * of its metadata.
+ */
+
+extern void XFS_bflush(xfs_buftarg_t *);
+#define xfs_binval(buftarg) XFS_bflush(buftarg)
+
+#define xfs_incore_relse(buftarg,delwri_only,wait)	\
+       pagebuf_target_clear(buftarg)
+
+
+#define xfs_baread(target, rablkno, ralen)  \
+	pagebuf_readahead((target), (rablkno), \
+			  (ralen), PBF_DONT_BLOCK)
+
+#define XFS_getrbuf(sleep,mp)	\
+	pagebuf_get_empty((mp)->m_ddev_targp)
+#define XFS_ngetrbuf(len,mp)	\
+	pagebuf_get_no_daddr(len,(mp)->m_ddev_targp)
+#define XFS_freerbuf(bp)	pagebuf_free(bp)
+#define XFS_nfreerbuf(bp)	pagebuf_free(bp)
+
+#endif
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
new file mode 100644
index 000000000000..ac62646cacde
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.c
@@ -0,0 +1,1203 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains the implementation of the xfs_buf_log_item.
+ * It contains the item operations used to manipulate the buf log
+ * items as well as utility routines used by the buffer specific
+ * transaction routines.
+ */
+
+#include <xfs.h>
+
+
+#define ROUNDUPNBWORD(x)	(((x) + (NBWORD - 1)) & ~(NBWORD - 1))
+
+kmem_zone_t	*xfs_buf_item_zone;
+
+#ifdef XFS_TRANS_DEBUG
+/*
+ * This function uses an alternate strategy for tracking the bytes
+ * that the user requests to be logged.	 This can then be used
+ * in conjunction with the bli_orig array in the buf log item to
+ * catch bugs in our callers' code.
+ *
+ * We also double check the bits set in xfs_buf_item_log using a
+ * simple algorithm to check that every byte is accounted for.
+ */
+STATIC void
+xfs_buf_item_log_debug(
+	xfs_buf_log_item_t	*bip,
+	uint			first,
+	uint			last)
+{
+	uint	x;
+	uint	byte;
+	uint	nbytes;
+	uint	chunk_num;
+	uint	word_num;
+	uint	bit_num;
+	uint	bit_set;
+	uint	*wordp;
+
+	ASSERT(bip->bli_logged != NULL);
+	byte = first;
+	nbytes = last - first + 1;
+	bfset(bip->bli_logged, first, nbytes);
+	for (x = 0; x < nbytes; x++) {
+		chunk_num = byte >> XFS_BLI_SHIFT;
+		word_num = chunk_num >> BIT_TO_WORD_SHIFT;
+		bit_num = chunk_num & (NBWORD - 1);
+		wordp = &(bip->bli_format.blf_data_map[word_num]);
+		bit_set = *wordp & (1 << bit_num);
+		ASSERT(bit_set);
+		byte++;
+	}
+}
+
+/*
+ * This function is called when we flush something into a buffer without
+ * logging it.	This happens for things like inodes which are logged
+ * separately from the buffer.
+ */
+void
+xfs_buf_item_flush_log_debug(
+	xfs_buf_t	*bp,
+	uint		first,
+	uint		last)
+{
+	xfs_buf_log_item_t	*bip;
+	uint			nbytes;
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
+		return;
+	}
+
+	ASSERT(bip->bli_logged != NULL);
+	nbytes = last - first + 1;
+	bfset(bip->bli_logged, first, nbytes);
+}
+
+/*
+ * This function is called to verify that our caller's have logged
+ * all the bytes that they changed.
+ *
+ * It does this by comparing the original copy of the buffer stored in
+ * the buf log item's bli_orig array to the current copy of the buffer
+ * and ensuring that all bytes which miscompare are set in the bli_logged
+ * array of the buf log item.
+ */
+STATIC void
+xfs_buf_item_log_check(
+	xfs_buf_log_item_t	*bip)
+{
+	char		*orig;
+	char		*buffer;
+	int		x;
+	xfs_buf_t	*bp;
+
+	ASSERT(bip->bli_orig != NULL);
+	ASSERT(bip->bli_logged != NULL);
+
+	bp = bip->bli_buf;
+	ASSERT(XFS_BUF_COUNT(bp) > 0);
+	ASSERT(XFS_BUF_PTR(bp) != NULL);
+	orig = bip->bli_orig;
+	buffer = XFS_BUF_PTR(bp);
+	for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
+		if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
+			cmn_err(CE_PANIC,
+	"xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
+				bip, bp, orig, x);
+	}
+}
+#else
+#define		xfs_buf_item_log_debug(x,y,z)
+#define		xfs_buf_item_log_check(x)
+#endif
+
+STATIC void	xfs_buf_error_relse(xfs_buf_t *bp);
+
+/*
+ * This returns the number of log iovecs needed to log the
+ * given buf log item.
+ *
+ * It calculates this as 1 iovec for the buf log format structure
+ * and 1 for each stretch of non-contiguous chunks to be logged.
+ * Contiguous chunks are logged in a single iovec.
+ *
+ * If the XFS_BLI_STALE flag has been set, then log nothing.
+ */
+uint
+xfs_buf_item_size(
+	xfs_buf_log_item_t	*bip)
+{
+	uint		nvecs;
+	int		next_bit;
+	int		last_bit;
+	xfs_buf_t	*bp;
+
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		/*
+		 * The buffer is stale, so all we need to log
+		 * is the buf log format structure with the
+		 * cancel flag in it.
+		 */
+		xfs_buf_item_trace("SIZE STALE", bip);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		return 1;
+	}
+
+	bp = bip->bli_buf;
+	ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
+	nvecs = 1;
+	last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+					 bip->bli_format.blf_map_size, 0);
+	ASSERT(last_bit != -1);
+	nvecs++;
+	while (last_bit != -1) {
+		/*
+		 * This takes the bit number to start looking from and
+		 * returns the next set bit from there.	 It returns -1
+		 * if there are no more bits set or the start bit is
+		 * beyond the end of the bitmap.
+		 */
+		next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+						 bip->bli_format.blf_map_size,
+						 last_bit + 1);
+		/*
+		 * If we run out of bits, leave the loop,
+		 * else if we find a new set of bits bump the number of vecs,
+		 * else keep scanning the current set of bits.
+		 */
+		if (next_bit == -1) {
+			last_bit = -1;
+		} else if (next_bit != last_bit + 1) {
+			last_bit = next_bit;
+			nvecs++;
+		} else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
+			   (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
+			    XFS_BLI_CHUNK)) {
+			last_bit = next_bit;
+			nvecs++;
+		} else {
+			last_bit++;
+		}
+	}
+
+	xfs_buf_item_trace("SIZE NORM", bip);
+	return nvecs;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given log buf item.	It fills the first entry with a buf log
+ * format structure, and the rest point to contiguous chunks
+ * within the buffer.
+ */
+void
+xfs_buf_item_format(
+	xfs_buf_log_item_t	*bip,
+	xfs_log_iovec_t		*log_vector)
+{
+	uint		base_size;
+	uint		nvecs;
+	xfs_log_iovec_t *vecp;
+	xfs_buf_t	*bp;
+	int		first_bit;
+	int		last_bit;
+	int		next_bit;
+	uint		nbits;
+	uint		buffer_offset;
+
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+	       (bip->bli_flags & XFS_BLI_STALE));
+	bp = bip->bli_buf;
+	ASSERT(XFS_BUF_BP_ISMAPPED(bp));
+	vecp = log_vector;
+
+	/*
+	 * The size of the base structure is the size of the
+	 * declared structure plus the space for the extra words
+	 * of the bitmap.  We subtract one from the map size, because
+	 * the first element of the bitmap is accounted for in the
+	 * size of the base structure.
+	 */
+	base_size =
+		(uint)(sizeof(xfs_buf_log_format_t) +
+		       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
+	vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
+	vecp->i_len = base_size;
+	vecp++;
+	nvecs = 1;
+
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		/*
+		 * The buffer is stale, so all we need to log
+		 * is the buf log format structure with the
+		 * cancel flag in it.
+		 */
+		xfs_buf_item_trace("FORMAT STALE", bip);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		bip->bli_format.blf_size = nvecs;
+		return;
+	}
+
+	/*
+	 * Fill in an iovec for each set of contiguous chunks.
+	 */
+	first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+					 bip->bli_format.blf_map_size, 0);
+	ASSERT(first_bit != -1);
+	last_bit = first_bit;
+	nbits = 1;
+	for (;;) {
+		/*
+		 * This takes the bit number to start looking from and
+		 * returns the next set bit from there.	 It returns -1
+		 * if there are no more bits set or the start bit is
+		 * beyond the end of the bitmap.
+		 */
+		next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
+						 bip->bli_format.blf_map_size,
+						 (uint)last_bit + 1);
+		/*
+		 * If we run out of bits fill in the last iovec and get
+		 * out of the loop.
+		 * Else if we start a new set of bits then fill in the
+		 * iovec for the series we were looking at and start
+		 * counting the bits in the new one.
+		 * Else we're still in the same set of bits so just
+		 * keep counting and scanning.
+		 */
+		if (next_bit == -1) {
+			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
+			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			nvecs++;
+			break;
+		} else if (next_bit != last_bit + 1) {
+			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
+			vecp->i_len = nbits * XFS_BLI_CHUNK;
+			nvecs++;
+			vecp++;
+			first_bit = next_bit;
+			last_bit = next_bit;
+			nbits = 1;
+		} else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
+			   (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
+			    XFS_BLI_CHUNK)) {
+			buffer_offset = first_bit * XFS_BLI_CHUNK;
+			vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
+			vecp->i_len = nbits * XFS_BLI_CHUNK;
+/* You would think we need to bump the nvecs here too, but we do not
+ * this number is used by recovery, and it gets confused by the boundary
+ * split here
+ *			nvecs++;
+ */
+			vecp++;
+			first_bit = next_bit;
+			last_bit = next_bit;
+		} else {
+			last_bit++;
+			nbits++;
+		}
+	}
+	bip->bli_format.blf_size = nvecs;
+
+	/*
+	 * Check to make sure everything is consistent.
+	 */
+	xfs_buf_item_trace("FORMAT NORM", bip);
+	xfs_buf_item_log_check(bip);
+}
+
+/*
+ * This is called to pin the buffer associated with the buf log
+ * item in memory so it cannot be written out.	Simply call bpin()
+ * on the buffer to do this.
+ */
+void
+xfs_buf_item_pin(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	bp = bip->bli_buf;
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
+	       (bip->bli_flags & XFS_BLI_STALE));
+	xfs_buf_item_trace("PIN", bip);
+	xfs_buftrace("XFS_PIN", bp);
+	xfs_bpin(bp);
+}
+
+
+/*
+ * This is called to unpin the buffer associated with the buf log
+ * item which was previously pinned with a call to xfs_buf_item_pin().
+ * Just call bunpin() on the buffer to do this.
+ *
+ * Also drop the reference to the buf item for the current transaction.
+ * If the XFS_BLI_STALE flag is set and we are the last reference,
+ * then free up the buf log item and unlock the buffer.
+ */
+void
+xfs_buf_item_unpin(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_mount_t	*mp;
+	xfs_buf_t	*bp;
+	int		freed;
+	SPLDECL(s);
+
+	bp = bip->bli_buf;
+	ASSERT(bp != NULL);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	xfs_buf_item_trace("UNPIN", bip);
+	xfs_buftrace("XFS_UNPIN", bp);
+
+	freed = atomic_dec_and_test(&bip->bli_refcount);
+	mp = bip->bli_item.li_mountp;
+	xfs_bunpin(bp);
+	if (freed && (bip->bli_flags & XFS_BLI_STALE)) {
+		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
+		ASSERT(XFS_BUF_ISSTALE(bp));
+/**
+		ASSERT(bp->b_pincount == 0);
+**/
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		xfs_buf_item_trace("UNPIN STALE", bip);
+		xfs_buftrace("XFS_UNPIN STALE", bp);
+		AIL_LOCK(mp,s);
+		/*
+		 * If we get called here because of an IO error, we may
+		 * or may not have the item on the AIL. xfs_trans_delete_ail()
+		 * will take care of that situation.
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
+		xfs_buf_item_relse(bp);
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
+		xfs_buf_relse(bp);
+	}
+
+}
+
+/*
+ * this is called from uncommit in the forced-shutdown path.
+ * we need to check to see if the reference count on the log item
+ * is going to drop to zero.  If so, unpin will free the log item
+ * so we need to free the item's descriptor (that points to the item)
+ * in the transaction.
+ */
+void
+xfs_buf_item_unpin_remove(
+	xfs_buf_log_item_t	*bip,
+	xfs_trans_t		*tp)
+{
+	xfs_buf_t		*bp;
+	xfs_log_item_desc_t	*lidp;
+
+	bp = bip->bli_buf;
+	/*
+	 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
+	 */
+	if ((atomic_read(&bip->bli_refcount) == 1) &&
+	    (bip->bli_flags & XFS_BLI_STALE)) {
+		ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
+		xfs_buf_item_trace("UNPIN REMOVE", bip);
+		xfs_buftrace("XFS_UNPIN_REMOVE", bp);
+		/*
+		 * yes -- clear the xaction descriptor in-use flag
+		 * and free the chunk if required.  We can safely
+		 * do some work here and then call buf_item_unpin
+		 * to do the rest because if the if is true, then
+		 * we are holding the buffer locked so no one else
+		 * will be able to bump up the refcount.
+		 */
+		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
+		xfs_trans_free_item(tp, lidp);
+		/*
+		 * Since the transaction no longer refers to the buffer,
+		 * the buffer should no longer refer to the transaction.
+		 */
+		XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+	}
+
+	xfs_buf_item_unpin(bip);
+
+	return;
+}
+
+/*
+ * This is called to attempt to lock the buffer associated with this
+ * buf log item.  Don't sleep on the buffer lock.  If we can't get
+ * the lock right away, return 0.  If we can get the lock, pull the
+ * buffer from the free list, mark it busy, and return 1.
+ */
+uint
+xfs_buf_item_trylock(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	bp = bip->bli_buf;
+
+	if (XFS_BUF_ISPINNED(bp)) {
+		return XFS_ITEM_PINNED;
+	}
+
+	if (!XFS_BUF_CPSEMA(bp)) {
+		return XFS_ITEM_LOCKED;
+	}
+
+	/*
+	 * Remove the buffer from the free list.  Only do this
+	 * if it's on the free list.  Private buffers like the
+	 * superblock buffer are not.
+	 */
+	XFS_BUF_HOLD(bp);
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
+	return XFS_ITEM_SUCCESS;
+}
+
+/*
+ * Release the buffer associated with the buf log item.
+ * If there is no dirty logged data associated with the
+ * buffer recorded in the buf log item, then free the
+ * buf log item and remove the reference to it in the
+ * buffer.
+ *
+ * This call ignores the recursion count.  It is only called
+ * when the buffer should REALLY be unlocked, regardless
+ * of the recursion count.
+ *
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then
+ * free the log item if necessary but do not unlock the buffer.
+ * This is for support of xfs_trans_bhold(). Make sure the
+ * XFS_BLI_HOLD field is cleared if we don't free the item.
+ */
+void
+xfs_buf_item_unlock(
+	xfs_buf_log_item_t	*bip)
+{
+	int		aborted;
+	xfs_buf_t	*bp;
+	uint		hold;
+
+	bp = bip->bli_buf;
+	xfs_buftrace("XFS_UNLOCK", bp);
+
+	/*
+	 * Clear the buffer's association with this transaction.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+
+	/*
+	 * If this is a transaction abort, don't return early.
+	 * Instead, allow the brelse to happen.
+	 * Normally it would be done for stale (cancelled) buffers
+	 * at unpin time, but we'll never go through the pin/unpin
+	 * cycle if we abort inside commit.
+	 */
+	aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
+
+	/*
+	 * If the buf item is marked stale, then don't do anything.
+	 * We'll unlock the buffer and free the buf item when the
+	 * buffer is unpinned for the last time.
+	 */
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		bip->bli_flags &= ~XFS_BLI_LOGGED;
+		xfs_buf_item_trace("UNLOCK STALE", bip);
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		if (!aborted)
+			return;
+	}
+
+	/*
+	 * Drop the transaction's reference to the log item if
+	 * it was not logged as part of the transaction.  Otherwise
+	 * we'll drop the reference in xfs_buf_item_unpin() when
+	 * the transaction is really through with the buffer.
+	 */
+	if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
+		atomic_dec(&bip->bli_refcount);
+	} else {
+		/*
+		 * Clear the logged flag since this is per
+		 * transaction state.
+		 */
+		bip->bli_flags &= ~XFS_BLI_LOGGED;
+	}
+
+	/*
+	 * Before possibly freeing the buf item, determine if we should
+	 * release the buffer at the end of this routine.
+	 */
+	hold = bip->bli_flags & XFS_BLI_HOLD;
+	xfs_buf_item_trace("UNLOCK", bip);
+
+	/*
+	 * If the buf item isn't tracking any data, free it.
+	 * Otherwise, if XFS_BLI_HOLD is set clear it.
+	 */
+	if (xfs_count_bits(bip->bli_format.blf_data_map,
+			      bip->bli_format.blf_map_size, 0) == 0) {
+		xfs_buf_item_relse(bp);
+	} else if (hold) {
+		bip->bli_flags &= ~XFS_BLI_HOLD;
+	}
+
+	/*
+	 * Release the buffer if XFS_BLI_HOLD was not set.
+	 */
+	if (!hold) {
+		xfs_buf_relse(bp);
+	}
+}
+
+/*
+ * This is called to find out where the oldest active copy of the
+ * buf log item in the on disk log resides now that the last log
+ * write of it completed at the given lsn.
+ * We always re-log all the dirty data in a buffer, so usually the
+ * latest copy in the on disk log is the only one that matters.	 For
+ * those cases we simply return the given lsn.
+ *
+ * The one exception to this is for buffers full of newly allocated
+ * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
+ * flag set, indicating that only the di_next_unlinked fields from the
+ * inodes in the buffers will be replayed during recovery.  If the
+ * original newly allocated inode images have not yet been flushed
+ * when the buffer is so relogged, then we need to make sure that we
+ * keep the old images in the 'active' portion of the log.  We do this
+ * by returning the original lsn of that transaction here rather than
+ * the current one.
+ */
+xfs_lsn_t
+xfs_buf_item_committed(
+	xfs_buf_log_item_t	*bip,
+	xfs_lsn_t		lsn)
+{
+	xfs_buf_item_trace("COMMITTED", bip);
+	if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+	    (bip->bli_item.li_lsn != 0)) {
+		return bip->bli_item.li_lsn;
+	}
+	return (lsn);
+}
+
+/*
+ * This is called when the transaction holding the buffer is aborted.
+ * Just behave as if the transaction had been cancelled. If we're shutting down
+ * and have aborted this transaction, we'll trap this buffer when it tries to
+ * get written out.
+ */
+void
+xfs_buf_item_abort(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	bp = bip->bli_buf;
+	xfs_buftrace("XFS_ABORT", bp);
+	XFS_BUF_SUPER_STALE(bp);
+	xfs_buf_item_unlock(bip);
+	return;
+}
+
+/*
+ * This is called to asynchronously write the buffer associated with this
+ * buf log item out to disk. The buffer will already have been locked by
+ * a successful call to xfs_buf_item_trylock().	 If the buffer still has
+ * B_DELWRI set, then get it going out to disk with a call to bawrite().
+ * If not, then just release the buffer.
+ */
+void
+xfs_buf_item_push(
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t	*bp;
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	xfs_buf_item_trace("PUSH", bip);
+
+	bp = bip->bli_buf;
+
+	if (XFS_BUF_ISDELAYWRITE(bp)) {
+		xfs_bawrite(bip->bli_item.li_mountp, bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+}
+
+/* ARGSUSED */
+void
+xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+struct xfs_item_ops xfs_buf_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_buf_item_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_buf_item_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
+					xfs_buf_item_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_buf_item_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_buf_item_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_buf_item_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_buf_item_committing
+};
+
+
+/*
+ * Allocate a new buf log item to go with the given buffer.
+ * Set the buffer's b_fsprivate field to point to the new
+ * buf log item.  If there are other item's attached to the
+ * buffer (see xfs_buf_attach_iodone() below), then put the
+ * buf log item at the front.
+ */
+void
+xfs_buf_item_init(
+	xfs_buf_t	*bp,
+	xfs_mount_t	*mp)
+{
+	xfs_log_item_t		*lip;
+	xfs_buf_log_item_t	*bip;
+	int			chunks;
+	int			map_size;
+
+	/*
+	 * Check to see if there is already a buf log item for
+	 * this buffer.	 If there is, it is guaranteed to be
+	 * the first.  If we do already have one, there is
+	 * nothing to do here so return.
+	 */
+	if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp)
+		XFS_BUF_SET_FSPRIVATE3(bp, mp);
+	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
+	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+		if (lip->li_type == XFS_LI_BUF) {
+			return;
+		}
+	}
+
+	/*
+	 * chunks is the number of XFS_BLI_CHUNK size pieces
+	 * the buffer can be divided into. Make sure not to
+	 * truncate any pieces.	 map_size is the size of the
+	 * bitmap needed to describe the chunks of the buffer.
+	 */
+	chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
+	map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
+
+	bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
+						    KM_SLEEP);
+	bip->bli_item.li_type = XFS_LI_BUF;
+	bip->bli_item.li_ops = &xfs_buf_item_ops;
+	bip->bli_item.li_mountp = mp;
+	bip->bli_buf = bp;
+	bip->bli_format.blf_type = XFS_LI_BUF;
+	bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
+	bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
+	bip->bli_format.blf_map_size = map_size;
+#ifdef XFS_BLI_TRACE
+	bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP);
+#endif
+
+#ifdef XFS_TRANS_DEBUG
+	/*
+	 * Allocate the arrays for tracking what needs to be logged
+	 * and what our callers request to be logged.  bli_orig
+	 * holds a copy of the original, clean buffer for comparison
+	 * against, and bli_logged keeps a 1 bit flag per byte in
+	 * the buffer to indicate which bytes the callers have asked
+	 * to have logged.
+	 */
+	bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
+	bcopy(XFS_BUF_PTR(bp), bip->bli_orig, XFS_BUF_COUNT(bp));
+	bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
+#endif
+
+	/*
+	 * Put the buf item into the list of items attached to the
+	 * buffer at the front.
+	 */
+	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+		bip->bli_item.li_bio_list =
+				XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+	}
+	XFS_BUF_SET_FSPRIVATE(bp, bip);
+}
+
+
+/*
+ * Mark bytes first through last inclusive as dirty in the buf
+ * item's bitmap.
+ */
+void
+xfs_buf_item_log(
+	xfs_buf_log_item_t	*bip,
+	uint			first,
+	uint			last)
+{
+	uint		first_bit;
+	uint		last_bit;
+	uint		bits_to_set;
+	uint		bits_set;
+	uint		word_num;
+	uint		*wordp;
+	uint		bit;
+	uint		end_bit;
+	uint		mask;
+
+	/*
+	 * Mark the item as having some dirty data for
+	 * quick reference in xfs_buf_item_dirty.
+	 */
+	bip->bli_flags |= XFS_BLI_DIRTY;
+
+	/*
+	 * Convert byte offsets to bit numbers.
+	 */
+	first_bit = first >> XFS_BLI_SHIFT;
+	last_bit = last >> XFS_BLI_SHIFT;
+
+	/*
+	 * Calculate the total number of bits to be set.
+	 */
+	bits_to_set = last_bit - first_bit + 1;
+
+	/*
+	 * Get a pointer to the first word in the bitmap
+	 * to set a bit in.
+	 */
+	word_num = first_bit >> BIT_TO_WORD_SHIFT;
+	wordp = &(bip->bli_format.blf_data_map[word_num]);
+
+	/*
+	 * Calculate the starting bit in the first word.
+	 */
+	bit = first_bit & (uint)(NBWORD - 1);
+
+	/*
+	 * First set any bits in the first word of our range.
+	 * If it starts at bit 0 of the word, it will be
+	 * set below rather than here.	That is what the variable
+	 * bit tells us. The variable bits_set tracks the number
+	 * of bits that have been set so far.  End_bit is the number
+	 * of the last bit to be set in this word plus one.
+	 */
+	if (bit) {
+		end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
+		mask = ((1 << (end_bit - bit)) - 1) << bit;
+		*wordp |= mask;
+		wordp++;
+		bits_set = end_bit - bit;
+	} else {
+		bits_set = 0;
+	}
+
+	/*
+	 * Now set bits a whole word at a time that are between
+	 * first_bit and last_bit.
+	 */
+	while ((bits_to_set - bits_set) >= NBWORD) {
+		*wordp |= 0xffffffff;
+		bits_set += NBWORD;
+		wordp++;
+	}
+
+	/*
+	 * Finally, set any bits left to be set in one last partial word.
+	 */
+	end_bit = bits_to_set - bits_set;
+	if (end_bit) {
+		mask = (1 << end_bit) - 1;
+		*wordp |= mask;
+	}
+
+	xfs_buf_item_log_debug(bip, first, last);
+}
+
+
+/*
+ * Return 1 if the buffer has some data that has been logged (at any
+ * point, not just the current transaction) and 0 if not.
+ */
+uint
+xfs_buf_item_dirty(
+	xfs_buf_log_item_t	*bip)
+{
+	return (bip->bli_flags & XFS_BLI_DIRTY);
+}
+
+/*
+ * This is called when the buf log item is no longer needed.  It should
+ * free the buf log item associated with the given buffer and clear
+ * the buffer's pointer to the buf log item.  If there are no more
+ * items in the list, clear the b_iodone field of the buffer (see
+ * xfs_buf_attach_iodone() below).
+ */
+void
+xfs_buf_item_relse(
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	xfs_buftrace("XFS_RELSE", bp);
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
+	if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
+	    (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
+/**
+		ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0);
+***/
+		XFS_BUF_CLR_IODONE_FUNC(bp);
+	}
+
+#ifdef XFS_TRANS_DEBUG
+	kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
+	bip->bli_orig = NULL;
+	kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
+	bip->bli_logged = NULL;
+#endif /* XFS_TRANS_DEBUG */
+
+#ifdef XFS_BLI_TRACE
+	ktrace_free(bip->bli_trace);
+#endif
+	kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
+
+/*
+ * Add the given log item with it's callback to the list of callbacks
+ * to be called when the buffer's I/O completes.  If it is not set
+ * already, set the buffer's b_iodone() routine to be
+ * xfs_buf_iodone_callbacks() and link the log item into the list of
+ * items rooted at b_fsprivate.	 Items are always added as the second
+ * entry in the list if there is a first, because the buf item code
+ * assumes that the buf log item is first.
+ */
+void
+xfs_buf_attach_iodone(
+	xfs_buf_t	*bp,
+	void		(*cb)(xfs_buf_t *, xfs_log_item_t *),
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*head_lip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+	lip->li_cb = cb;
+	if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+		head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+		lip->li_bio_list = head_lip->li_bio_list;
+		head_lip->li_bio_list = lip;
+	} else {
+		XFS_BUF_SET_FSPRIVATE(bp, lip);
+	}
+
+	ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
+	       (XFS_BUF_IODONE_FUNC(bp) == NULL));
+	XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
+}
+
+STATIC void
+xfs_buf_do_callbacks(
+	xfs_buf_t	*bp,
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*nlip;
+
+	while (lip != NULL) {
+		nlip = lip->li_bio_list;
+		ASSERT(lip->li_cb != NULL);
+		/*
+		 * Clear the next pointer so we don't have any
+		 * confusion if the item is added to another buf.
+		 * Don't touch the log item after calling its
+		 * callback, because it could have freed itself.
+		 */
+		lip->li_bio_list = NULL;
+		lip->li_cb(bp, lip);
+		lip = nlip;
+	}
+}
+
+/*
+ * This is the iodone() function for buffers which have had callbacks
+ * attached to them by xfs_buf_attach_iodone().	 It should remove each
+ * log item from the buffer's list and call the callback of each in turn.
+ * When done, the buffer's fsprivate field is set to NULL and the buffer
+ * is unlocked with a call to iodone().
+ */
+void
+xfs_buf_iodone_callbacks(
+	xfs_buf_t	*bp)
+{
+	xfs_log_item_t	*lip;
+	static time_t	lasttime;
+	static dev_t	lastdev;
+	xfs_mount_t	*mp;
+
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+
+	if (XFS_BUF_GETERROR(bp) != 0) {
+		/*
+		 * If we've already decided to shutdown the filesystem
+		 * because of IO errors, there's no point in giving this
+		 * a retry.
+		 */
+		mp = lip->li_mountp;
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			ASSERT(XFS_BUF_TARGET_DEV(bp) == mp->m_dev);
+			XFS_BUF_SUPER_STALE(bp);
+			xfs_buftrace("BUF_IODONE_CB", bp);
+			xfs_buf_do_callbacks(bp, lip);
+			XFS_BUF_SET_FSPRIVATE(bp, NULL);
+			XFS_BUF_CLR_IODONE_FUNC(bp);
+
+			/*
+			 * XFS_SHUT flag gets set when we go thru the
+			 * entire buffer cache and deliberately start
+			 * throwing away delayed write buffers.
+			 * Since there's no biowait done on those,
+			 * we should just brelse them.
+			 */
+			if (XFS_BUF_ISSHUT(bp)) {
+			    XFS_BUF_UNSHUT(bp);
+				xfs_buf_relse(bp);
+			} else {
+				xfs_biodone(bp);
+			}
+
+			return;
+		}
+
+		if ((XFS_BUF_TARGET_DEV(bp) != lastdev) ||
+		    ((lbolt - lasttime) > 500)) {
+			prdev("XFS write error in file system meta-data "
+			      "block 0x%Lx in %s",
+			      XFS_BUF_TARGET_DEV(bp),
+			      XFS_BUF_ADDR(bp), mp->m_fsname);
+			lasttime = lbolt;
+		}
+		lastdev = XFS_BUF_TARGET_DEV(bp);
+
+		if (XFS_BUF_ISASYNC(bp)) {
+			/*
+			 * If the write was asynchronous then noone will be
+			 * looking for the error.  Clear the error state
+			 * and write the buffer out again delayed write.
+			 *
+			 * XXXsup This is OK, so long as we catch these
+			 * before we start the umount; we don't want these
+			 * DELWRI metadata bufs to be hanging around.
+			 */
+			XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
+
+			if (!(XFS_BUF_ISSTALE(bp))) {
+				XFS_BUF_DELAYWRITE(bp);
+				XFS_BUF_DONE(bp);
+				XFS_BUF_SET_START(bp);
+			}
+			ASSERT(XFS_BUF_IODONE_FUNC(bp));
+			xfs_buftrace("BUF_IODONE ASYNC", bp);
+			xfs_buf_relse(bp);
+		} else {
+			/*
+			 * If the write of the buffer was not asynchronous,
+			 * then we want to make sure to return the error
+			 * to the caller of bwrite().  Because of this we
+			 * cannot clear the B_ERROR state at this point.
+			 * Instead we install a callback function that
+			 * will be called when the buffer is released, and
+			 * that routine will clear the error state and
+			 * set the buffer to be written out again after
+			 * some delay.
+			 */
+			/* We actually overwrite the existing b-relse
+			   function at times, but we're gonna be shutting down
+			   anyway. */
+			XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
+			XFS_BUF_DONE(bp);
+			XFS_BUF_V_IODONESEMA(bp);
+		}
+		return;
+	}
+#ifdef XFSERRORDEBUG
+	xfs_buftrace("XFS BUFCB NOERR", bp);
+#endif
+	xfs_buf_do_callbacks(bp, lip);
+	XFS_BUF_SET_FSPRIVATE(bp, NULL);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	xfs_biodone(bp);
+}
+
+/*
+ * This is a callback routine attached to a buffer which gets an error
+ * when being written out synchronously.
+ */
+STATIC void
+xfs_buf_error_relse(
+	xfs_buf_t	*bp)
+{
+	xfs_log_item_t	*lip;
+	xfs_mount_t	*mp;
+
+	lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+	mp = (xfs_mount_t *)lip->li_mountp;
+	ASSERT(XFS_BUF_TARGET_DEV(bp) == mp->m_dev);
+
+	XFS_BUF_STALE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_ERROR(bp,0);
+	xfs_buftrace("BUF_ERROR_RELSE", bp);
+	if (! XFS_FORCED_SHUTDOWN(mp))
+		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+	/*
+	 * We have to unpin the pinned buffers so do the
+	 * callbacks.
+	 */
+	xfs_buf_do_callbacks(bp, lip);
+	XFS_BUF_SET_FSPRIVATE(bp, NULL);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
+	xfs_buf_relse(bp);
+}
+
+
+/*
+ * This is the iodone() function for buffers which have been
+ * logged.  It is called when they are eventually flushed out.
+ * It should remove the buf item from the AIL, and free the buf item.
+ * It is called by xfs_buf_iodone_callbacks() above which will take
+ * care of cleaning up the buffer itself.
+ */
+/* ARGSUSED */
+void
+xfs_buf_iodone(
+	xfs_buf_t		*bp,
+	xfs_buf_log_item_t	*bip)
+{
+	struct xfs_mount	*mp;
+	SPLDECL(s);
+
+	ASSERT(bip->bli_buf == bp);
+
+	mp = bip->bli_item.li_mountp;
+
+	/*
+	 * If we are forcibly shutting down, this may well be
+	 * off the AIL already. That's because we simulate the
+	 * log-committed callbacks to unpin these buffers. Or we may never
+	 * have put this item on AIL because of the transaction was
+	 * aborted forcibly. xfs_trans_delete_ail() takes care of these.
+	 *
+	 * Either way, AIL is useless if we're forcing a shutdown.
+	 */
+	AIL_LOCK(mp,s);
+	/*
+	 * xfs_trans_delete_ail() drops the AIL lock.
+	 */
+	xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s);
+
+#ifdef XFS_TRANS_DEBUG
+	kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
+	bip->bli_orig = NULL;
+	kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
+	bip->bli_logged = NULL;
+#endif /* XFS_TRANS_DEBUG */
+
+#ifdef XFS_BLI_TRACE
+	ktrace_free(bip->bli_trace);
+#endif
+	kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
+#if defined(XFS_BLI_TRACE)
+void
+xfs_buf_item_trace(
+	char			*id,
+	xfs_buf_log_item_t	*bip)
+{
+	xfs_buf_t		*bp;
+	ASSERT(bip->bli_trace != NULL);
+
+	bp = bip->bli_buf;
+	ktrace_enter(bip->bli_trace,
+		     (void *)id,
+		     (void *)bip->bli_buf,
+		     (void *)((unsigned long)bip->bli_flags),
+		     (void *)((unsigned long)bip->bli_recur),
+		     (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
+		     (void *)XFS_BUF_ADDR(bp),
+		     (void *)((unsigned long)XFS_BUF_COUNT(bp)),
+		     (void *)((unsigned long)(0xFFFFFFFF & (XFS_BFLAGS(bp) >> 32))),
+		     (void *)((unsigned long)(0xFFFFFFFF & XFS_BFLAGS(bp))),
+		     XFS_BUF_FSPRIVATE(bp, void *),
+		     XFS_BUF_FSPRIVATE2(bp, void *),
+		     (void *)((unsigned long)bp->b_pincount),
+		     (void *)XFS_BUF_IODONE_FUNC(bp),
+		     (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
+		     (void *)bip->bli_item.li_desc,
+		     (void *)((unsigned long)bip->bli_item.li_flags));
+}
+#endif /* XFS_BLI_TRACE */
+
+
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
new file mode 100644
index 000000000000..3cd9480f7212
--- /dev/null
+++ b/fs/xfs/xfs_buf_item.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_BUF_ITEM_H__
+#define __XFS_BUF_ITEM_H__
+
+/*
+ * This is the structure used to lay out a buf log item in the
+ * log.	 The data map describes which 128 byte chunks of the buffer
+ * have been logged.  This structure works only on buffers that
+ * reside up to the first TB in the filesystem.	 These buffers are
+ * generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF.
+ */
+typedef struct xfs_buf_log_format_v1 {
+	unsigned short	blf_type;	/* buf log item type indicator */
+	unsigned short	blf_size;	/* size of this item */
+	__int32_t	blf_blkno;	/* starting blkno of this buf */
+	ushort		blf_flags;	/* misc state */
+	ushort		blf_len;	/* number of blocks in this buf */
+	unsigned int	blf_map_size;	/* size of data bitmap in words */
+	unsigned int	blf_data_map[1];/* variable size bitmap of */
+					/*   regions of buffer in this item */
+} xfs_buf_log_format_v1_t;
+
+/*
+ * This is a form of the above structure with a 64 bit blkno field.
+ * For 6.2 and beyond, this is XFS_LI_BUF.  We use this to log everything.
+ */
+typedef struct xfs_buf_log_format_t {
+	unsigned short	blf_type;	/* buf log item type indicator */
+	unsigned short	blf_size;	/* size of this item */
+	ushort		blf_flags;	/* misc state */
+	ushort		blf_len;	/* number of blocks in this buf */
+	__int64_t	blf_blkno;	/* starting blkno of this buf */
+	unsigned int	blf_map_size;	/* size of data bitmap in words */
+	unsigned int	blf_data_map[1];/* variable size bitmap of */
+					/*   regions of buffer in this item */
+} xfs_buf_log_format_t;
+
+/*
+ * This flag indicates that the buffer contains on disk inodes
+ * and requires special recovery handling.
+ */
+#define XFS_BLI_INODE_BUF	0x1
+/*
+ * This flag indicates that the buffer should not be replayed
+ * during recovery because its blocks are being freed.
+ */
+#define XFS_BLI_CANCEL		0x2
+/*
+ * This flag indicates that the buffer contains on disk
+ * user or group dquots and may require special recovery handling.
+ */
+#define XFS_BLI_UDQUOT_BUF	0x4
+/* #define XFS_BLI_PDQUOT_BUF	0x8 */
+#define XFS_BLI_GDQUOT_BUF	0x10
+
+#define XFS_BLI_CHUNK		128
+#define XFS_BLI_SHIFT		7
+#define BIT_TO_WORD_SHIFT	5
+#define NBWORD			(NBBY * sizeof(unsigned int))
+
+/*
+ * buf log item flags
+ */
+#define XFS_BLI_HOLD		0x01
+#define XFS_BLI_DIRTY		0x02
+#define XFS_BLI_STALE		0x04
+#define XFS_BLI_LOGGED		0x08
+#define XFS_BLI_INODE_ALLOC_BUF 0x10
+
+
+#ifdef __KERNEL__
+
+struct xfs_buf;
+struct ktrace;
+struct xfs_mount;
+
+/*
+ * This is the in core log item structure used to track information
+ * needed to log buffers.  It tracks how many times the lock has been
+ * locked, and which 128 byte chunks of the buffer are dirty.
+ */
+typedef struct xfs_buf_log_item {
+	xfs_log_item_t		bli_item;	/* common item structure */
+	struct xfs_buf		*bli_buf;	/* real buffer pointer */
+	unsigned int		bli_flags;	/* misc flags */
+	unsigned int		bli_recur;	/* lock recursion count */
+	atomic_t		bli_refcount;	/* cnt of tp refs */
+#ifdef DEBUG
+	struct ktrace		*bli_trace;	/* event trace buf */
+#endif
+#ifdef XFS_TRANS_DEBUG
+	char			*bli_orig;	/* original buffer copy */
+	char			*bli_logged;	/* bytes logged (bitmap) */
+#endif
+	xfs_buf_log_format_t	bli_format;	/* in-log header */
+} xfs_buf_log_item_t;
+
+/*
+ * This structure is used during recovery to record the buf log
+ * items which have been canceled and should not be replayed.
+ */
+typedef struct xfs_buf_cancel {
+	xfs_daddr_t			bc_blkno;
+	uint			bc_len;
+	int			bc_refcount;
+	struct xfs_buf_cancel	*bc_next;
+} xfs_buf_cancel_t;
+
+#define XFS_BLI_TRACE_SIZE	32
+
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_BLI_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_BLI_TRACE
+#endif
+
+#if defined(XFS_BLI_TRACE)
+void	xfs_buf_item_trace(char *, xfs_buf_log_item_t *);
+#else
+#define xfs_buf_item_trace(id, bip)
+#endif
+
+void	xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+void	xfs_buf_item_relse(struct xfs_buf *);
+void	xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
+uint	xfs_buf_item_dirty(xfs_buf_log_item_t *);
+void	xfs_buf_attach_iodone(struct xfs_buf *,
+			      void(*)(struct xfs_buf *, xfs_log_item_t *),
+			      xfs_log_item_t *);
+void	xfs_buf_iodone_callbacks(struct xfs_buf *);
+void	xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *);
+
+#ifdef XFS_TRANS_DEBUG
+void
+xfs_buf_item_flush_log_debug(
+	struct xfs_buf *bp,
+	uint	first,
+	uint	last);
+#else
+#define xfs_buf_item_flush_log_debug(bp, first, last)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_cap.c b/fs/xfs/xfs_cap.c
new file mode 100644
index 000000000000..39170a35ab5e
--- /dev/null
+++ b/fs/xfs/xfs_cap.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+STATIC int xfs_cap_allow_set(vnode_t *);
+
+
+/*
+ * Test for existence of capability attribute as efficiently as possible.
+ */
+int
+xfs_cap_vhascap(
+	vnode_t		*vp)
+{
+	int		error;
+	int		len = sizeof(xfs_cap_set_t);
+	int		flags = ATTR_KERNOVAL|ATTR_ROOT;
+
+	VOP_ATTR_GET(vp, SGI_CAP_LINUX, NULL, &len, flags, sys_cred, error);
+	return (error == 0);
+}
+
+/*
+ * Convert from extended attribute representation to in-memory for XFS.
+ */
+STATIC int
+posix_cap_xattr_to_xfs(
+	posix_cap_xattr		*src,
+	size_t			size,
+	xfs_cap_set_t		*dest)
+{
+	if (!src || !dest)
+		return EINVAL;
+
+	if (src->c_version != cpu_to_le32(POSIX_CAP_XATTR_VERSION))
+		return EINVAL;
+	if (src->c_abiversion != cpu_to_le32(_LINUX_CAPABILITY_VERSION))
+		return EINVAL;
+
+	if (size < sizeof(posix_cap_xattr))
+		return EINVAL;
+
+	ASSERT(sizeof(dest->cap_effective) == sizeof(src->c_effective));
+
+	dest->cap_effective	= src->c_effective;
+	dest->cap_permitted	= src->c_permitted;
+	dest->cap_inheritable	= src->c_inheritable;
+
+	return 0;
+}
+
+/*
+ * Convert from in-memory XFS to extended attribute representation.
+ */
+STATIC int
+posix_cap_xfs_to_xattr(
+	xfs_cap_set_t		*src,
+	posix_cap_xattr		*xattr_cap,
+	size_t			size)
+{
+	size_t			new_size = posix_cap_xattr_size();
+
+	if (size < new_size)
+		return -ERANGE;
+
+	ASSERT(sizeof(xattr_cap->c_effective) == sizeof(src->cap_effective));
+
+	xattr_cap->c_version	= cpu_to_le32(POSIX_CAP_XATTR_VERSION);
+	xattr_cap->c_abiversion = cpu_to_le32(_LINUX_CAPABILITY_VERSION);
+	xattr_cap->c_effective	= src->cap_effective;
+	xattr_cap->c_permitted	= src->cap_permitted;
+	xattr_cap->c_inheritable= src->cap_inheritable;
+
+	return new_size;
+}
+
+int
+xfs_cap_vget(
+	vnode_t		*vp,
+	void		*cap,
+	size_t		size)
+{
+	int		error;
+	int		len = sizeof(xfs_cap_set_t);
+	int		flags = ATTR_ROOT;
+	xfs_cap_set_t	xfs_cap = { 0 };
+	posix_cap_xattr *xattr_cap = cap;
+
+	VN_HOLD(vp);
+	if ((error = _MAC_VACCESS(vp, NULL, VREAD)))
+		goto out;
+
+	if (!size)
+		flags |= ATTR_KERNOVAL;
+	VOP_ATTR_GET(vp, SGI_CAP_LINUX, (char *)&xfs_cap,
+			&len, flags, sys_cred, error);
+	if (error)
+		goto out;
+	ASSERT(len == sizeof(xfs_cap_set_t));
+
+	error = (size)? -posix_cap_xattr_size() :
+			-posix_cap_xfs_to_xattr(&xfs_cap, xattr_cap, size);
+out:
+	VN_RELE(vp);
+	return -error;
+}
+
+int
+xfs_cap_vremove(
+	vnode_t		*vp)
+{
+	int		error;
+
+	VN_HOLD(vp);
+	error = xfs_cap_allow_set(vp);
+	if (!error) {
+		VOP_ATTR_REMOVE(vp, SGI_CAP_LINUX, ATTR_ROOT, sys_cred, error);
+		if (error == ENOATTR)
+			error = 0;	/* 'scool */
+	}
+	VN_RELE(vp);
+	return -error;
+}
+
+int
+xfs_cap_vset(
+	vnode_t			*vp,
+	void			*cap,
+	size_t			size)
+{
+	posix_cap_xattr		*xattr_cap = cap;
+	xfs_cap_set_t		xfs_cap;
+	int			error;
+
+	if (!cap)
+		return -EINVAL;
+
+	error = posix_cap_xattr_to_xfs(xattr_cap, size, &xfs_cap);
+	if (error)
+		return -error;
+
+	VN_HOLD(vp);
+	error = xfs_cap_allow_set(vp);
+	if (error)
+		goto out;
+
+	VOP_ATTR_SET(vp, SGI_CAP_LINUX, (char *)&xfs_cap,
+			sizeof(xfs_cap_set_t), ATTR_ROOT, sys_cred, error);
+out:
+	VN_RELE(vp);
+	return -error;
+}
+
+STATIC int
+xfs_cap_allow_set(
+	vnode_t		*vp)
+{
+	vattr_t		va;
+	int		error;
+
+	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
+		return EROFS;
+	if ((error = _MAC_VACCESS(vp, NULL, VWRITE)))
+		return error;
+	va.va_mask = AT_UID;
+	VOP_GETATTR(vp, &va, 0, NULL, error);
+	if (error)
+		return error;
+	if (va.va_uid != current->fsuid && !capable(CAP_FOWNER))
+		return EPERM;
+	return error;
+}
diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h
new file mode 100644
index 000000000000..aa9bf8817d8a
--- /dev/null
+++ b/fs/xfs/xfs_cap.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_CAP_H__
+#define __XFS_CAP_H__
+
+/*
+ * Capabilities
+ */
+typedef __uint64_t xfs_cap_value_t;
+
+typedef struct xfs_cap_set {
+	xfs_cap_value_t cap_effective;	/* use in capability checks */
+	xfs_cap_value_t cap_permitted;	/* combined with file attrs */
+	xfs_cap_value_t cap_inheritable;/* pass through exec */
+} xfs_cap_set_t;
+
+/* On-disk XFS extended attribute names */
+#define SGI_CAP_FILE	"SGI_CAP_FILE"
+#define SGI_CAP_FILE_SIZE	(sizeof(SGI_CAP_FILE)-1)
+#define SGI_CAP_LINUX	"SGI_CAP_LINUX"
+#define SGI_CAP_LINUX_SIZE	(sizeof(SGI_CAP_LINUX)-1)
+
+/*
+ * For Linux, we take the bitfields directly from capability.h
+ * and no longer attempt to keep this attribute ondisk compatible
+ * with IRIX.  Since this attribute is only set on exectuables,
+ * it just doesn't make much sense to try.  We do use a different
+ * named attribute though, to avoid confusion.
+ */
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_FS_POSIX_CAP
+
+#include <linux/posix_cap_xattr.h>
+
+struct vnode;
+
+extern int xfs_cap_vhascap(struct vnode *);
+extern int xfs_cap_vset(struct vnode *, void *, size_t);
+extern int xfs_cap_vget(struct vnode *, void *, size_t);
+extern int xfs_cap_vremove(struct vnode *vp);
+
+#define _CAP_EXISTS		xfs_cap_vhascap
+
+#else
+#define xfs_cap_vset(v,p,sz)	(-EOPNOTSUPP)
+#define xfs_cap_vget(v,p,sz)	(-EOPNOTSUPP)
+#define xfs_cap_vremove(v)	(-EOPNOTSUPP)
+#define _CAP_EXISTS		(NULL)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_CAP_H__ */
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
new file mode 100644
index 000000000000..14195151eeea
--- /dev/null
+++ b/fs/xfs/xfs_clnt.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_CLNT_H__
+#define __XFS_CLNT_H__
+
+/*
+ * XFS arguments structure, constructed from the arguments we
+ * are passed via the mount system call.
+ *
+ * NOTE: The mount system call is handled differently between
+ * Linux and IRIX.  In IRIX we worked work with a binary data
+ * structure coming in across the syscall interface from user
+ * space (the mount userspace knows about each filesystem type
+ * and the set of valid options for it, and converts the users
+ * argument string into a binary structure _before_ making the
+ * system call), and the ABI issues that this implies.
+ *
+ * In Linux, we are passed a comma separated set of options;
+ * ie. a NULL terminated string of characters.	Userspace mount
+ * code does not have any knowledge of mount options expected by
+ * each filesystem type and so each filesystem parses its mount
+ * options in kernel space.
+ *
+ * For the Linux port, we kept this structure pretty much intact
+ * and use it internally (because the existing code groks it).
+ */
+struct xfs_mount_args {
+	int	flags;		/* flags -> see XFSMNT_... macros below */
+	int	logbufs;	/* Number of log buffers, -1 to default */
+	int	logbufsize;	/* Size of log buffers, -1 to default */
+	char	fsname[MAXNAMELEN];	/* data device name */
+	char	rtname[MAXNAMELEN];	/* realtime device filename */
+	char	logname[MAXNAMELEN];	/* journal device filename */
+	char	mtpt[MAXNAMELEN];	/* filesystem mount point */
+	int	sunit;		/* stripe unit (BBs) */
+	int	swidth;		/* stripe width (BBs), multiple of sunit */
+	uchar_t iosizelog;	/* log2 of the preferred I/O size */
+
+	/*  The remainder is for CXFS support.	*/
+	char	**servlist;	/* Table of hosts which may be servers */
+	int	*servlistlen;	/* Table of hostname lengths. */
+	int	slcount;	/* Count of hosts which may be servers. */
+	int	stimeout;	/* Server timeout in milliseconds */
+	int	ctimeout;	/* Client timeout in milliseconds */
+	char	*server;	/* Designated server hostname (for remount). */
+	int	servlen;	/* Length of server hostname (for remount). */
+	int	servcell;	/* Server cell (internal testing only) */
+};
+
+/*
+ * XFS mount option flags
+ */
+#define XFSMNT_CHKLOG		0x00000001	/* check log */
+#define XFSMNT_WSYNC		0x00000002	/* safe mode nfs mount
+						 * compatible */
+#define XFSMNT_INO64		0x00000004	/* move inode numbers up
+						 * past 2^32 */
+#define XFSMNT_UQUOTA		0x00000008	/* user quota accounting */
+#define XFSMNT_PQUOTA		0x00000010	/* IRIX prj quota accounting */
+#define XFSMNT_UQUOTAENF	0x00000020	/* user quota limit
+						 * enforcement */
+#define XFSMNT_PQUOTAENF	0x00000040	/* IRIX project quota limit
+						 * enforcement */
+#define XFSMNT_NOATIME		0x00000100	/* don't modify access
+						 * times on reads */
+#define XFSMNT_NOALIGN		0x00000200	/* don't allocate at
+						 * stripe boundaries*/
+#define XFSMNT_RETERR		0x00000400	/* return error to user */
+#define XFSMNT_NORECOVERY	0x00000800	/* no recovery, implies
+						 * read-only mount */
+#define XFSMNT_SHARED		0x00001000	/* shared XFS mount */
+#define XFSMNT_IOSIZE		0x00002000	/* optimize for I/O size */
+#define XFSMNT_OSYNCISOSYNC	0x00004000	/* o_sync is REALLY o_sync */
+						/* (osyncisdsync is now default) */
+#define XFSMNT_CLNTONLY		0x00008000	/* cxfs mount as client only */
+#define XFSMNT_UNSHARED		0x00010000	/* cxfs filesystem mounted
+						 * unshared */
+#define XFSMNT_CHGCLNTONLY	0x00020000	/* changing client only flag */
+						/* (for remount only) */
+#define XFSMNT_SERVCELL		0x00040000	/* setting server cell */
+						/* (allowed on remount) */
+#define XFSMNT_MAKESERVER	0x00080000	/* become the server (remount */
+						/* only) */
+#define XFSMNT_NOTSERVER	0x00100000	/* give up being the server */
+						/* (remount only) */
+#define XFSMNT_DMAPI		0x00200000	/* enable dmapi/xdsm */
+#define XFSMNT_GQUOTA		0x00400000	/* group quota accounting */
+#define XFSMNT_GQUOTAENF	0x00800000	/* group quota limit
+						 * enforcement */
+#define XFSMNT_NOUUID		0x01000000	/* Ignore fs uuid */
+#define XFSMNT_32BITINODES	0x02000000	/* restrict inodes to 32
+						 * bits of address space */
+#define XFSMNT_IRIXSGID		0x04000000	/* Irix-style sgid inheritance */
+#define XFSMNT_NOLOGFLUSH	0x08000000	/* Don't flush for log blocks */
+
+/* Did we get any args for CXFS to consume? */
+#define XFSARGS_FOR_CXFSARR(ap)		\
+	((ap)->servlist || (ap)->slcount >= 0 || \
+	 (ap)->stimeout >= 0 || (ap)->ctimeout >= 0 || \
+	 (ap)->flags & (XFSMNT_CLNTONLY | XFSMNT_UNSHARED))
+
+#endif	/* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
new file mode 100644
index 000000000000..020801f897ae
--- /dev/null
+++ b/fs/xfs/xfs_da_btree.c
@@ -0,0 +1,2579 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+#if defined(XFSDEBUG) && defined(CONFIG_KDB)
+#undef xfs_buftrace
+#define xfs_buftrace(A,B) \
+	printk("    xfs_buftrace : %s (0x%p)\n", A, B); \
+	BUG();
+#endif
+
+/*
+ * xfs_da_btree.c
+ *
+ * Routines to implement directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC int xfs_da_root_split(xfs_da_state_t *state,
+					    xfs_da_state_blk_t *existing_root,
+					    xfs_da_state_blk_t *new_child);
+STATIC int xfs_da_node_split(xfs_da_state_t *state,
+					    xfs_da_state_blk_t *existing_blk,
+					    xfs_da_state_blk_t *split_blk,
+					    xfs_da_state_blk_t *blk_to_add,
+					    int treelevel,
+					    int *result);
+STATIC void xfs_da_node_rebalance(xfs_da_state_t *state,
+					 xfs_da_state_blk_t *node_blk_1,
+					 xfs_da_state_blk_t *node_blk_2);
+STATIC void xfs_da_node_add(xfs_da_state_t *state,
+				   xfs_da_state_blk_t *old_node_blk,
+				   xfs_da_state_blk_t *new_node_blk);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+STATIC int xfs_da_root_join(xfs_da_state_t *state,
+					   xfs_da_state_blk_t *root_blk);
+STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval);
+STATIC void xfs_da_node_remove(xfs_da_state_t *state,
+					      xfs_da_state_blk_t *drop_blk);
+STATIC void xfs_da_node_unbalance(xfs_da_state_t *state,
+					 xfs_da_state_blk_t *src_node_blk,
+					 xfs_da_state_blk_t *dst_node_blk);
+
+/*
+ * Utility routines.
+ */
+STATIC uint	xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count);
+STATIC int	xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp);
+STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra);
+
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of an intermediate node.
+ */
+int
+xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
+				 xfs_dabuf_t **bpp, int whichfork)
+{
+	xfs_da_intnode_t *node;
+	xfs_dabuf_t *bp;
+	int error;
+	xfs_trans_t *tp;
+
+	tp = args->trans;
+	error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	node = bp->data;
+	INT_ZERO(node->hdr.info.forw, ARCH_CONVERT);
+	INT_ZERO(node->hdr.info.back, ARCH_CONVERT);
+	INT_SET(node->hdr.info.magic, ARCH_CONVERT, XFS_DA_NODE_MAGIC);
+	INT_ZERO(node->hdr.info.pad, ARCH_CONVERT);
+	INT_ZERO(node->hdr.count, ARCH_CONVERT);
+	INT_SET(node->hdr.level, ARCH_CONVERT, level);
+
+	xfs_da_log_buf(tp, bp,
+		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+	*bpp = bp;
+	return(0);
+}
+
+/*
+ * Split a leaf node, rebalance, then possibly split
+ * intermediate nodes, rebalance, etc.
+ */
+int							/* error */
+xfs_da_split(xfs_da_state_t *state)
+{
+	xfs_da_state_blk_t *oldblk, *newblk, *addblk;
+	xfs_da_intnode_t *node;
+	xfs_dabuf_t *bp;
+	int max, action, error, i;
+
+	/*
+	 * Walk back up the tree splitting/inserting/adjusting as necessary.
+	 * If we need to insert and there isn't room, split the node, then
+	 * decide which fragment to insert the new block from below into.
+	 * Note that we may split the root this way, but we need more fixup.
+	 */
+	max = state->path.active - 1;
+	ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
+	ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
+	       state->path.blk[max].magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+
+	addblk = &state->path.blk[max];		/* initial dummy value */
+	for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
+		oldblk = &state->path.blk[i];
+		newblk = &state->altpath.blk[i];
+
+		/*
+		 * If a leaf node then
+		 *     Allocate a new leaf node, then rebalance across them.
+		 * else if an intermediate node then
+		 *     We split on the last layer, must we split the node?
+		 */
+		switch (oldblk->magic) {
+		case XFS_ATTR_LEAF_MAGIC:
+#ifndef __KERNEL__
+			return(ENOTTY);
+#else
+			error = xfs_attr_leaf_split(state, oldblk, newblk);
+			if ((error != 0) && (error != ENOSPC)) {
+				return(error);	/* GROT: attr is inconsistent */
+			}
+			if (!error) {
+				addblk = newblk;
+				break;
+			}
+			/*
+			 * Entry wouldn't fit, split the leaf again.
+			 */
+			state->extravalid = 1;
+			if (state->inleaf) {
+				state->extraafter = 0;	/* before newblk */
+				error = xfs_attr_leaf_split(state, oldblk,
+							    &state->extrablk);
+			} else {
+				state->extraafter = 1;	/* after newblk */
+				error = xfs_attr_leaf_split(state, newblk,
+							    &state->extrablk);
+			}
+			if (error)
+				return(error);	/* GROT: attr inconsistent */
+			addblk = newblk;
+			break;
+#endif
+		case XFS_DIR_LEAF_MAGIC:
+			ASSERT(XFS_DIR_IS_V1(state->mp));
+			error = xfs_dir_leaf_split(state, oldblk, newblk);
+			if ((error != 0) && (error != ENOSPC)) {
+				return(error);	/* GROT: dir is inconsistent */
+			}
+			if (!error) {
+				addblk = newblk;
+				break;
+			}
+			/*
+			 * Entry wouldn't fit, split the leaf again.
+			 */
+			state->extravalid = 1;
+			if (state->inleaf) {
+				state->extraafter = 0;	/* before newblk */
+				error = xfs_dir_leaf_split(state, oldblk,
+							   &state->extrablk);
+				if (error)
+					return(error);	/* GROT: dir incon. */
+				addblk = newblk;
+			} else {
+				state->extraafter = 1;	/* after newblk */
+				error = xfs_dir_leaf_split(state, newblk,
+							   &state->extrablk);
+				if (error)
+					return(error);	/* GROT: dir incon. */
+				addblk = newblk;
+			}
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+			ASSERT(XFS_DIR_IS_V2(state->mp));
+			error = xfs_dir2_leafn_split(state, oldblk, newblk);
+			if (error)
+				return error;
+			addblk = newblk;
+			break;
+		case XFS_DA_NODE_MAGIC:
+			error = xfs_da_node_split(state, oldblk, newblk, addblk,
+							 max - i, &action);
+			xfs_da_buf_done(addblk->bp);
+			addblk->bp = NULL;
+			if (error)
+				return(error);	/* GROT: dir is inconsistent */
+			/*
+			 * Record the newly split block for the next time thru?
+			 */
+			if (action)
+				addblk = newblk;
+			else
+				addblk = NULL;
+			break;
+		}
+
+		/*
+		 * Update the btree to show the new hashval for this child.
+		 */
+		xfs_da_fixhashpath(state, &state->path);
+		/*
+		 * If we won't need this block again, it's getting dropped
+		 * from the active path by the loop control, so we need
+		 * to mark it done now.
+		 */
+		if (i > 0 || !addblk)
+			xfs_da_buf_done(oldblk->bp);
+	}
+	if (!addblk)
+		return(0);
+
+	/*
+	 * Split the root node.
+	 */
+	ASSERT(state->path.active == 0);
+	oldblk = &state->path.blk[0];
+	error = xfs_da_root_split(state, oldblk, addblk);
+	if (error) {
+		xfs_da_buf_done(oldblk->bp);
+		xfs_da_buf_done(addblk->bp);
+		addblk->bp = NULL;
+		return(error);	/* GROT: dir is inconsistent */
+	}
+
+	/*
+	 * Update pointers to the node which used to be block 0 and
+	 * just got bumped because of the addition of a new root node.
+	 * There might be three blocks involved if a double split occurred,
+	 * and the original block 0 could be at any position in the list.
+	 */
+
+	node = oldblk->bp->data;
+	if (!INT_ISZERO(node->hdr.info.forw, ARCH_CONVERT)) {
+		if (INT_GET(node->hdr.info.forw, ARCH_CONVERT) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->data;
+		INT_SET(node->hdr.info.back, ARCH_CONVERT, oldblk->blkno);
+		xfs_da_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	node = oldblk->bp->data;
+	if (INT_GET(node->hdr.info.back, ARCH_CONVERT)) {
+		if (INT_GET(node->hdr.info.back, ARCH_CONVERT) == addblk->blkno) {
+			bp = addblk->bp;
+		} else {
+			ASSERT(state->extravalid);
+			bp = state->extrablk.bp;
+		}
+		node = bp->data;
+		INT_SET(node->hdr.info.forw, ARCH_CONVERT, oldblk->blkno);
+		xfs_da_log_buf(state->args->trans, bp,
+		    XFS_DA_LOGRANGE(node, &node->hdr.info,
+		    sizeof(node->hdr.info)));
+	}
+	xfs_da_buf_done(oldblk->bp);
+	xfs_da_buf_done(addblk->bp);
+	addblk->bp = NULL;
+	return(0);
+}
+
+/*
+ * Split the root.  We have to create a new root and point to the two
+ * parts (the split old root) that we just created.  Copy block zero to
+ * the EOF, extending the inode in process.
+ */
+STATIC int						/* error */
+xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+				 xfs_da_state_blk_t *blk2)
+{
+	xfs_da_intnode_t *node, *oldroot;
+	xfs_da_args_t *args;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+	int error, size;
+	xfs_inode_t *dp;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+	xfs_dir2_leaf_t *leaf;
+
+	/*
+	 * Copy the existing (incorrect) block from the root node position
+	 * to a free space somewhere.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		return(error);
+	dp = args->dp;
+	tp = args->trans;
+	mp = state->mp;
+	error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	node = bp->data;
+	oldroot = blk1->bp->data;
+	if (INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+		size = (int)((char *)&oldroot->btree[INT_GET(oldroot->hdr.count, ARCH_CONVERT)] -
+			     (char *)oldroot);
+	} else {
+		ASSERT(XFS_DIR_IS_V2(mp));
+		ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+		leaf = (xfs_dir2_leaf_t *)oldroot;
+		size = (int)((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] -
+			     (char *)leaf);
+	}
+	bcopy(oldroot, node, size);
+	xfs_da_log_buf(tp, bp, 0, size - 1);
+	xfs_da_buf_done(blk1->bp);
+	blk1->bp = bp;
+	blk1->blkno = blkno;
+
+	/*
+	 * Set up the new root node.
+	 */
+	error = xfs_da_node_create(args,
+		args->whichfork == XFS_DATA_FORK &&
+		XFS_DIR_IS_V2(mp) ? mp->m_dirleafblk : 0,
+		INT_GET(node->hdr.level, ARCH_CONVERT) + 1, &bp, args->whichfork);
+	if (error)
+		return(error);
+	node = bp->data;
+	INT_SET(node->btree[0].hashval, ARCH_CONVERT, blk1->hashval);
+	INT_SET(node->btree[0].before, ARCH_CONVERT, blk1->blkno);
+	INT_SET(node->btree[1].hashval, ARCH_CONVERT, blk2->hashval);
+	INT_SET(node->btree[1].before, ARCH_CONVERT, blk2->blkno);
+	INT_SET(node->hdr.count, ARCH_CONVERT, 2);
+	if (XFS_DIR_IS_V2(mp)) {
+		ASSERT(blk1->blkno >= mp->m_dirleafblk &&
+		       blk1->blkno < mp->m_dirfreeblk);
+		ASSERT(blk2->blkno >= mp->m_dirleafblk &&
+		       blk2->blkno < mp->m_dirfreeblk);
+	}
+	/* Header is already logged by xfs_da_node_create */
+	xfs_da_log_buf(tp, bp,
+		XFS_DA_LOGRANGE(node, node->btree,
+			sizeof(xfs_da_node_entry_t) * 2));
+	xfs_da_buf_done(bp);
+
+	return(0);
+}
+
+/*
+ * Split the node, rebalance, then add the new entry.
+ */
+STATIC int						/* error */
+xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+				 xfs_da_state_blk_t *newblk,
+				 xfs_da_state_blk_t *addblk,
+				 int treelevel, int *result)
+{
+	xfs_da_intnode_t *node;
+	xfs_dablk_t blkno;
+	int newcount, error;
+	int useextra;
+
+	node = oldblk->bp->data;
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+
+	/*
+	 * With V2 the extra block is data or freespace.
+	 */
+	useextra = state->extravalid && XFS_DIR_IS_V1(state->mp);
+	newcount = 1 + useextra;
+	/*
+	 * Do we have to split the node?
+	 */
+	if ((INT_GET(node->hdr.count, ARCH_CONVERT) + newcount) > XFS_DA_NODE_ENTRIES(state->mp)) {
+		/*
+		 * Allocate a new node, add to the doubly linked chain of
+		 * nodes, then move some of our excess entries into it.
+		 */
+		error = xfs_da_grow_inode(state->args, &blkno);
+		if (error)
+			return(error);	/* GROT: dir is inconsistent */
+
+		error = xfs_da_node_create(state->args, blkno, treelevel,
+					   &newblk->bp, state->args->whichfork);
+		if (error)
+			return(error);	/* GROT: dir is inconsistent */
+		newblk->blkno = blkno;
+		newblk->magic = XFS_DA_NODE_MAGIC;
+		xfs_da_node_rebalance(state, oldblk, newblk);
+		error = xfs_da_blk_link(state, oldblk, newblk);
+		if (error)
+			return(error);
+		*result = 1;
+	} else {
+		*result = 0;
+	}
+
+	/*
+	 * Insert the new entry(s) into the correct block
+	 * (updating last hashval in the process).
+	 *
+	 * xfs_da_node_add() inserts BEFORE the given index,
+	 * and as a result of using node_lookup_int() we always
+	 * point to a valid entry (not after one), but a split
+	 * operation always results in a new block whose hashvals
+	 * FOLLOW the current block.
+	 *
+	 * If we had double-split op below us, then add the extra block too.
+	 */
+	node = oldblk->bp->data;
+	if (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)) {
+		oldblk->index++;
+		xfs_da_node_add(state, oldblk, addblk);
+		if (useextra) {
+			if (state->extraafter)
+				oldblk->index++;
+			xfs_da_node_add(state, oldblk, &state->extrablk);
+			state->extravalid = 0;
+		}
+	} else {
+		newblk->index++;
+		xfs_da_node_add(state, newblk, addblk);
+		if (useextra) {
+			if (state->extraafter)
+				newblk->index++;
+			xfs_da_node_add(state, newblk, &state->extrablk);
+			state->extravalid = 0;
+		}
+	}
+
+	return(0);
+}
+
+/*
+ * Balance the btree elements between two intermediate nodes,
+ * usually one full and one empty.
+ *
+ * NOTE: if blk2 is empty, then it will get the upper half of blk1.
+ */
+STATIC void
+xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+				     xfs_da_state_blk_t *blk2)
+{
+	xfs_da_intnode_t *node1, *node2, *tmpnode;
+	xfs_da_node_entry_t *btree_s, *btree_d;
+	int count, tmp;
+	xfs_trans_t *tp;
+
+	node1 = blk1->bp->data;
+	node2 = blk2->bp->data;
+	/*
+	 * Figure out how many entries need to move, and in which direction.
+	 * Swap the nodes around if that makes it simpler.
+	 */
+	if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
+	    ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
+	     (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
+	      INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
+		tmpnode = node1;
+		node1 = node2;
+		node2 = tmpnode;
+	}
+	ASSERT(INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	ASSERT(INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	count = (INT_GET(node1->hdr.count, ARCH_CONVERT) - INT_GET(node2->hdr.count, ARCH_CONVERT)) / 2;
+	if (count == 0)
+		return;
+	tp = state->args->trans;
+	/*
+	 * Two cases: high-to-low and low-to-high.
+	 */
+	if (count > 0) {
+		/*
+		 * Move elements in node2 up to make a hole.
+		 */
+		if ((tmp = INT_GET(node2->hdr.count, ARCH_CONVERT)) > 0) {
+			tmp *= (uint)sizeof(xfs_da_node_entry_t);
+			btree_s = &node2->btree[0];
+			btree_d = &node2->btree[count];
+			ovbcopy(btree_s, btree_d, tmp);
+		}
+
+		/*
+		 * Move the req'd B-tree elements from high in node1 to
+		 * low in node2.
+		 */
+		INT_MOD(node2->hdr.count, ARCH_CONVERT, count);
+		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT) - count];
+		btree_d = &node2->btree[0];
+		bcopy(btree_s, btree_d, tmp);
+		INT_MOD(node1->hdr.count, ARCH_CONVERT, -(count));
+
+	} else {
+		/*
+		 * Move the req'd B-tree elements from low in node2 to
+		 * high in node1.
+		 */
+		count = -count;
+		tmp = count * (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &node2->btree[0];
+		btree_d = &node1->btree[INT_GET(node1->hdr.count, ARCH_CONVERT)];
+		bcopy(btree_s, btree_d, tmp);
+		INT_MOD(node1->hdr.count, ARCH_CONVERT, count);
+		xfs_da_log_buf(tp, blk1->bp,
+			XFS_DA_LOGRANGE(node1, btree_d, tmp));
+
+		/*
+		 * Move elements in node2 down to fill the hole.
+		 */
+		tmp  = INT_GET(node2->hdr.count, ARCH_CONVERT) - count;
+		tmp *= (uint)sizeof(xfs_da_node_entry_t);
+		btree_s = &node2->btree[count];
+		btree_d = &node2->btree[0];
+		ovbcopy(btree_s, btree_d, tmp);
+		INT_MOD(node2->hdr.count, ARCH_CONVERT, -(count));
+	}
+
+	/*
+	 * Log header of node 1 and all current bits of node 2.
+	 */
+	xfs_da_log_buf(tp, blk1->bp,
+		XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr)));
+	xfs_da_log_buf(tp, blk2->bp,
+		XFS_DA_LOGRANGE(node2, &node2->hdr,
+			sizeof(node2->hdr) +
+			sizeof(node2->btree[0]) * INT_GET(node2->hdr.count, ARCH_CONVERT)));
+
+	/*
+	 * Record the last hashval from each block for upward propagation.
+	 * (note: don't use the swapped node pointers)
+	 */
+	node1 = blk1->bp->data;
+	node2 = blk2->bp->data;
+	blk1->hashval = INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+	blk2->hashval = INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 */
+	if (blk1->index >= INT_GET(node1->hdr.count, ARCH_CONVERT)) {
+		blk2->index = blk1->index - INT_GET(node1->hdr.count, ARCH_CONVERT);
+		blk1->index = INT_GET(node1->hdr.count, ARCH_CONVERT) + 1;	/* make it invalid */
+	}
+}
+
+/*
+ * Add a new entry to an intermediate node.
+ */
+STATIC void
+xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+			       xfs_da_state_blk_t *newblk)
+{
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	int tmp;
+	xfs_mount_t *mp;
+
+	node = oldblk->bp->data;
+	mp = state->mp;
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	ASSERT((oldblk->index >= 0) && (oldblk->index <= INT_GET(node->hdr.count, ARCH_CONVERT)));
+	ASSERT(newblk->blkno != 0);
+	if (state->args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		ASSERT(newblk->blkno >= mp->m_dirleafblk &&
+		       newblk->blkno < mp->m_dirfreeblk);
+
+	/*
+	 * We may need to make some room before we insert the new node.
+	 */
+	tmp = 0;
+	btree = &node->btree[ oldblk->index ];
+	if (oldblk->index < INT_GET(node->hdr.count, ARCH_CONVERT)) {
+		tmp = (INT_GET(node->hdr.count, ARCH_CONVERT) - oldblk->index) * (uint)sizeof(*btree);
+		ovbcopy(btree, btree + 1, tmp);
+	}
+	INT_SET(btree->hashval, ARCH_CONVERT, newblk->hashval);
+	INT_SET(btree->before, ARCH_CONVERT, newblk->blkno);
+	xfs_da_log_buf(state->args->trans, oldblk->bp,
+		XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree)));
+	INT_MOD(node->hdr.count, ARCH_CONVERT, +1);
+	xfs_da_log_buf(state->args->trans, oldblk->bp,
+		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+	/*
+	 * Copy the last hash value from the oldblk to propagate upwards.
+	 */
+	oldblk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Deallocate an empty leaf node, remove it from its parent,
+ * possibly deallocating that block, etc...
+ */
+int
+xfs_da_join(xfs_da_state_t *state)
+{
+	xfs_da_state_blk_t *drop_blk, *save_blk;
+	int action, error;
+
+	action = 0;
+	drop_blk = &state->path.blk[ state->path.active-1 ];
+	save_blk = &state->altpath.blk[ state->path.active-1 ];
+	ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
+	ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
+	       drop_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp));
+
+	/*
+	 * Walk back up the tree joining/deallocating as necessary.
+	 * When we stop dropping blocks, break out.
+	 */
+	for (  ; state->path.active >= 2; drop_blk--, save_blk--,
+		 state->path.active--) {
+		/*
+		 * See if we can combine the block with a neighbor.
+		 *   (action == 0) => no options, just leave
+		 *   (action == 1) => coalesce, then unlink
+		 *   (action == 2) => block empty, unlink it
+		 */
+		switch (drop_blk->magic) {
+		case XFS_ATTR_LEAF_MAGIC:
+#ifndef __KERNEL__
+			error = ENOTTY;
+#else
+			error = xfs_attr_leaf_toosmall(state, &action);
+#endif
+			if (error)
+				return(error);
+			if (action == 0)
+				return(0);
+#ifdef __KERNEL__
+			xfs_attr_leaf_unbalance(state, drop_blk, save_blk);
+#endif
+			break;
+		case XFS_DIR_LEAF_MAGIC:
+			ASSERT(XFS_DIR_IS_V1(state->mp));
+			error = xfs_dir_leaf_toosmall(state, &action);
+			if (error)
+				return(error);
+			if (action == 0)
+				return(0);
+			xfs_dir_leaf_unbalance(state, drop_blk, save_blk);
+			break;
+		case XFS_DIR2_LEAFN_MAGIC:
+			ASSERT(XFS_DIR_IS_V2(state->mp));
+			error = xfs_dir2_leafn_toosmall(state, &action);
+			if (error)
+				return error;
+			if (action == 0)
+				return 0;
+			xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
+			break;
+		case XFS_DA_NODE_MAGIC:
+			/*
+			 * Remove the offending node, fixup hashvals,
+			 * check for a toosmall neighbor.
+			 */
+			xfs_da_node_remove(state, drop_blk);
+			xfs_da_fixhashpath(state, &state->path);
+			error = xfs_da_node_toosmall(state, &action);
+			if (error)
+				return(error);
+			if (action == 0)
+				return 0;
+			xfs_da_node_unbalance(state, drop_blk, save_blk);
+			break;
+		}
+		xfs_da_fixhashpath(state, &state->altpath);
+		error = xfs_da_blk_unlink(state, drop_blk, save_blk);
+		xfs_da_state_kill_altpath(state);
+		if (error)
+			return(error);
+		error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
+							 drop_blk->bp);
+		drop_blk->bp = NULL;
+		if (error)
+			return(error);
+	}
+	/*
+	 * We joined all the way to the top.  If it turns out that
+	 * we only have one entry in the root, make the child block
+	 * the new root.
+	 */
+	xfs_da_node_remove(state, drop_blk);
+	xfs_da_fixhashpath(state, &state->path);
+	error = xfs_da_root_join(state, &state->path.blk[0]);
+	return(error);
+}
+
+/*
+ * We have only one entry in the root.	Copy the only remaining child of
+ * the old root to block 0 as the new root node.
+ */
+STATIC int
+xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
+{
+	xfs_da_intnode_t *oldroot;
+	/* REFERENCED */
+	xfs_da_blkinfo_t *blkinfo;
+	xfs_da_args_t *args;
+	xfs_dablk_t child;
+	xfs_dabuf_t *bp;
+	int error;
+
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
+	oldroot = root_blk->bp->data;
+	ASSERT(INT_GET(oldroot->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	ASSERT(INT_ISZERO(oldroot->hdr.info.forw, ARCH_CONVERT));
+	ASSERT(INT_ISZERO(oldroot->hdr.info.back, ARCH_CONVERT));
+
+	/*
+	 * If the root has more than one child, then don't do anything.
+	 */
+	if (INT_GET(oldroot->hdr.count, ARCH_CONVERT) > 1)
+		return(0);
+
+	/*
+	 * Read in the (only) child block, then copy those bytes into
+	 * the root block's buffer and free the original child block.
+	 */
+	child = INT_GET(oldroot->btree[ 0 ].before, ARCH_CONVERT);
+	ASSERT(child != 0);
+	error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
+					     args->whichfork);
+	if (error)
+		return(error);
+	ASSERT(bp != NULL);
+	blkinfo = bp->data;
+	if (INT_GET(oldroot->hdr.level, ARCH_CONVERT) == 1) {
+		ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+	} else {
+		ASSERT(INT_GET(blkinfo->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	}
+	ASSERT(INT_ISZERO(blkinfo->forw, ARCH_CONVERT));
+	ASSERT(INT_ISZERO(blkinfo->back, ARCH_CONVERT));
+	bcopy(bp->data, root_blk->bp->data, state->blocksize);
+	xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
+	error = xfs_da_shrink_inode(args, child, bp);
+	return(error);
+}
+
+/*
+ * Check a node block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+STATIC int
+xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
+{
+	xfs_da_intnode_t *node;
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	int count, forward, error, retval, i;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	info = blk->bp->data;
+	ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	node = (xfs_da_intnode_t *)info;
+	count = INT_GET(node->hdr.count, ARCH_CONVERT);
+	if (count > (XFS_DA_NODE_ENTRIES(state->mp) >> 1)) {
+		*action = 0;	/* blk over 50%, dont try to join */
+		return(0);	/* blk over 50%, dont try to join */
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (aribtrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = (!INT_ISZERO(info->forw, ARCH_CONVERT));
+		bcopy(&state->path, &state->altpath, sizeof(state->path));
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return(error);
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return(0);
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	/* start with smaller blk num */
+	forward = (INT_GET(info->forw, ARCH_CONVERT)
+				< INT_GET(info->back, ARCH_CONVERT));
+	for (i = 0; i < 2; forward = !forward, i++) {
+		if (forward)
+			blkno = INT_GET(info->forw, ARCH_CONVERT);
+		else
+			blkno = INT_GET(info->back, ARCH_CONVERT);
+		if (blkno == 0)
+			continue;
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+					blkno, -1, &bp, state->args->whichfork);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+
+		node = (xfs_da_intnode_t *)info;
+		count  = XFS_DA_NODE_ENTRIES(state->mp);
+		count -= XFS_DA_NODE_ENTRIES(state->mp) >> 2;
+		count -= INT_GET(node->hdr.count, ARCH_CONVERT);
+		node = bp->data;
+		ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		count -= INT_GET(node->hdr.count, ARCH_CONVERT);
+		xfs_da_brelse(state->args->trans, bp);
+		if (count >= 0)
+			break;	/* fits with at least 25% to spare */
+	}
+	if (i >= 2) {
+		*action = 0;
+		return(0);
+	}
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	bcopy(&state->path, &state->altpath, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error) {
+			return(error);
+		}
+		if (retval) {
+			*action = 0;
+			return(0);
+		}
+	} else {
+		error = xfs_da_path_shift(state, &state->path, forward,
+						 0, &retval);
+		if (error) {
+			return(error);
+		}
+		if (retval) {
+			*action = 0;
+			return(0);
+		}
+	}
+	*action = 1;
+	return(0);
+}
+
+/*
+ * Walk back up the tree adjusting hash values as necessary,
+ * when we stop making changes, return.
+ */
+void
+xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
+{
+	xfs_da_state_blk_t *blk;
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	xfs_dahash_t lasthash=0;
+	int level, count;
+
+	level = path->active-1;
+	blk = &path->blk[ level ];
+	switch (blk->magic) {
+#ifdef __KERNEL__
+	case XFS_ATTR_LEAF_MAGIC:
+		lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+#endif
+	case XFS_DIR_LEAF_MAGIC:
+		ASSERT(XFS_DIR_IS_V1(state->mp));
+		lasthash = xfs_dir_leaf_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+		ASSERT(XFS_DIR_IS_V2(state->mp));
+		lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	case XFS_DA_NODE_MAGIC:
+		lasthash = xfs_da_node_lasthash(blk->bp, &count);
+		if (count == 0)
+			return;
+		break;
+	}
+	for (blk--, level--; level >= 0; blk--, level--) {
+		node = blk->bp->data;
+		ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		btree = &node->btree[ blk->index ];
+		if (INT_GET(btree->hashval, ARCH_CONVERT) == lasthash)
+			break;
+		blk->hashval = lasthash;
+		INT_SET(btree->hashval, ARCH_CONVERT, lasthash);
+		xfs_da_log_buf(state->args->trans, blk->bp,
+				  XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+
+		lasthash = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+	}
+}
+
+/*
+ * Remove an entry from an intermediate node.
+ */
+STATIC void
+xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk)
+{
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	int tmp;
+
+	node = drop_blk->bp->data;
+	ASSERT(drop_blk->index < INT_GET(node->hdr.count, ARCH_CONVERT));
+	ASSERT(drop_blk->index >= 0);
+
+	/*
+	 * Copy over the offending entry, or just zero it out.
+	 */
+	btree = &node->btree[drop_blk->index];
+	if (drop_blk->index < (INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
+		tmp  = INT_GET(node->hdr.count, ARCH_CONVERT) - drop_blk->index - 1;
+		tmp *= (uint)sizeof(xfs_da_node_entry_t);
+		ovbcopy(btree + 1, btree, tmp);
+		xfs_da_log_buf(state->args->trans, drop_blk->bp,
+		    XFS_DA_LOGRANGE(node, btree, tmp));
+		btree = &node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ];
+	}
+	bzero((char *)btree, sizeof(xfs_da_node_entry_t));
+	xfs_da_log_buf(state->args->trans, drop_blk->bp,
+	    XFS_DA_LOGRANGE(node, btree, sizeof(*btree)));
+	INT_MOD(node->hdr.count, ARCH_CONVERT, -1);
+	xfs_da_log_buf(state->args->trans, drop_blk->bp,
+	    XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+
+	/*
+	 * Copy the last hash value from the block to propagate upwards.
+	 */
+	btree--;
+	drop_blk->hashval = INT_GET(btree->hashval, ARCH_CONVERT);
+}
+
+/*
+ * Unbalance the btree elements between two intermediate nodes,
+ * move all Btree elements from one node into another.
+ */
+STATIC void
+xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+				     xfs_da_state_blk_t *save_blk)
+{
+	xfs_da_intnode_t *drop_node, *save_node;
+	xfs_da_node_entry_t *btree;
+	int tmp;
+	xfs_trans_t *tp;
+
+	drop_node = drop_blk->bp->data;
+	save_node = save_blk->bp->data;
+	ASSERT(INT_GET(drop_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	ASSERT(INT_GET(save_node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	tp = state->args->trans;
+
+	/*
+	 * If the dying block has lower hashvals, then move all the
+	 * elements in the remaining block up to make a hole.
+	 */
+	if ((INT_GET(drop_node->btree[ 0 ].hashval, ARCH_CONVERT) < INT_GET(save_node->btree[ 0 ].hashval, ARCH_CONVERT)) ||
+	    (INT_GET(drop_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
+	     INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))
+	{
+		btree = &save_node->btree[ INT_GET(drop_node->hdr.count, ARCH_CONVERT) ];
+		tmp = INT_GET(save_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
+		ovbcopy(&save_node->btree[0], btree, tmp);
+		btree = &save_node->btree[0];
+		xfs_da_log_buf(tp, save_blk->bp,
+			XFS_DA_LOGRANGE(save_node, btree,
+				(INT_GET(save_node->hdr.count, ARCH_CONVERT) + INT_GET(drop_node->hdr.count, ARCH_CONVERT)) *
+				sizeof(xfs_da_node_entry_t)));
+	} else {
+		btree = &save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT) ];
+		xfs_da_log_buf(tp, save_blk->bp,
+			XFS_DA_LOGRANGE(save_node, btree,
+				INT_GET(drop_node->hdr.count, ARCH_CONVERT) *
+				sizeof(xfs_da_node_entry_t)));
+	}
+
+	/*
+	 * Move all the B-tree elements from drop_blk to save_blk.
+	 */
+	tmp = INT_GET(drop_node->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_da_node_entry_t);
+	bcopy(&drop_node->btree[0], btree, tmp);
+	INT_MOD(save_node->hdr.count, ARCH_CONVERT, INT_GET(drop_node->hdr.count, ARCH_CONVERT));
+
+	xfs_da_log_buf(tp, save_blk->bp,
+		XFS_DA_LOGRANGE(save_node, &save_node->hdr,
+			sizeof(save_node->hdr)));
+
+	/*
+	 * Save the last hashval in the remaining block for upward propagation.
+	 */
+	save_blk->hashval = INT_GET(save_node->btree[ INT_GET(save_node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Walk down the Btree looking for a particular filename, filling
+ * in the state structure as we go.
+ *
+ * We will set the state structure to point to each of the elements
+ * in each of the nodes where either the hashval is or should be.
+ *
+ * We support duplicate hashval's so for each entry in the current
+ * node that could contain the desired hashval, descend.  This is a
+ * pruned depth-first tree search.
+ */
+int							/* error */
+xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
+{
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *curr;
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	xfs_dablk_t blkno;
+	int probe, span, max, error, retval;
+	xfs_dahash_t hashval;
+	xfs_da_args_t *args;
+
+	args = state->args;
+	/*
+	 * Descend thru the B-tree searching each level for the right
+	 * node to use, until the right hashval is found.
+	 */
+	if (args->whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(state->mp))
+		blkno = state->mp->m_dirleafblk;
+	else
+		blkno = 0;
+	for (blk = &state->path.blk[0], state->path.active = 1;
+			 state->path.active <= XFS_DA_NODE_MAXDEPTH;
+			 blk++, state->path.active++) {
+		/*
+		 * Read the next node down in the tree.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+					blkno, -1, &blk->bp,
+					state->args->whichfork);
+		if (error) {
+			blk->blkno = 0;
+			state->path.active--;
+			return(error);
+		}
+		ASSERT(blk->bp != NULL);
+		curr = blk->bp->data;
+		ASSERT(INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
+		       INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+
+		/*
+		 * Search an intermediate node for a match.
+		 */
+		blk->magic = INT_GET(curr->magic, ARCH_CONVERT);
+		if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+			node = blk->bp->data;
+			blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+
+			/*
+			 * Binary search.  (note: small blocks will skip loop)
+			 */
+			max = INT_GET(node->hdr.count, ARCH_CONVERT);
+			probe = span = max / 2;
+			hashval = state->args->hashval;
+			for (btree = &node->btree[probe]; span > 4;
+				   btree = &node->btree[probe]) {
+				span /= 2;
+				if (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)
+					probe += span;
+				else if (INT_GET(btree->hashval, ARCH_CONVERT) > hashval)
+					probe -= span;
+				else
+					break;
+			}
+			ASSERT((probe >= 0) && (probe < max));
+			ASSERT((span <= 4) || (INT_GET(btree->hashval, ARCH_CONVERT) == hashval));
+
+			/*
+			 * Since we may have duplicate hashval's, find the first
+			 * matching hashval in the node.
+			 */
+			while ((probe > 0) && (INT_GET(btree->hashval, ARCH_CONVERT) >= hashval)) {
+				btree--;
+				probe--;
+			}
+			while ((probe < max) && (INT_GET(btree->hashval, ARCH_CONVERT) < hashval)) {
+				btree++;
+				probe++;
+			}
+
+			/*
+			 * Pick the right block to descend on.
+			 */
+			if (probe == max) {
+				blk->index = max-1;
+				blkno = INT_GET(node->btree[ max-1 ].before, ARCH_CONVERT);
+			} else {
+				blk->index = probe;
+				blkno = INT_GET(btree->before, ARCH_CONVERT);
+			}
+		}
+#ifdef __KERNEL__
+		else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC) {
+			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
+			break;
+		}
+#endif
+		else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
+			blk->hashval = xfs_dir_leaf_lasthash(blk->bp, NULL);
+			break;
+		}
+		else if (INT_GET(curr->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
+			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
+			break;
+		}
+	}
+
+	/*
+	 * A leaf block that ends in the hashval that we are interested in
+	 * (final hashval == search hashval) means that the next block may
+	 * contain more entries with the same hashval, shift upward to the
+	 * next leaf and keep searching.
+	 */
+	for (;;) {
+		if (blk->magic == XFS_DIR_LEAF_MAGIC) {
+			ASSERT(XFS_DIR_IS_V1(state->mp));
+			retval = xfs_dir_leaf_lookup_int(blk->bp, state->args,
+								  &blk->index);
+		} else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
+			ASSERT(XFS_DIR_IS_V2(state->mp));
+			retval = xfs_dir2_leafn_lookup_int(blk->bp, state->args,
+							&blk->index, state);
+		}
+#ifdef __KERNEL__
+		else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+			retval = xfs_attr_leaf_lookup_int(blk->bp, state->args);
+			blk->index = state->args->index;
+			state->args->blkno = blk->blkno;
+		}
+#endif
+		if (((retval == ENOENT) || (retval == ENOATTR)) &&
+		    (blk->hashval == state->args->hashval)) {
+			error = xfs_da_path_shift(state, &state->path, 1, 1,
+							 &retval);
+			if (error)
+				return(error);
+			if (retval == 0) {
+				continue;
+			}
+#ifdef __KERNEL__
+			else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+				/* path_shift() gives ENOENT */
+				retval = XFS_ERROR(ENOATTR);
+			}
+#endif
+		}
+		break;
+	}
+	*result = retval;
+	return(0);
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Link a new block into a doubly linked list of blocks (of whatever type).
+ */
+int							/* error */
+xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+			       xfs_da_state_blk_t *new_blk)
+{
+	xfs_da_blkinfo_t *old_info, *new_info, *tmp_info;
+	xfs_da_args_t *args;
+	int before=0, error;
+	xfs_dabuf_t *bp;
+
+	/*
+	 * Set up environment.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	old_info = old_blk->bp->data;
+	new_info = new_blk->bp->data;
+	ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
+	       old_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+	       old_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(old_blk->magic == INT_GET(old_info->magic, ARCH_CONVERT));
+	ASSERT(new_blk->magic == INT_GET(new_info->magic, ARCH_CONVERT));
+	ASSERT(old_blk->magic == new_blk->magic);
+
+	switch (old_blk->magic) {
+#ifdef __KERNEL__
+	case XFS_ATTR_LEAF_MAGIC:
+		before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
+		break;
+#endif
+	case XFS_DIR_LEAF_MAGIC:
+		ASSERT(XFS_DIR_IS_V1(state->mp));
+		before = xfs_dir_leaf_order(old_blk->bp, new_blk->bp);
+		break;
+	case XFS_DIR2_LEAFN_MAGIC:
+		ASSERT(XFS_DIR_IS_V2(state->mp));
+		before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
+		break;
+	case XFS_DA_NODE_MAGIC:
+		before = xfs_da_node_order(old_blk->bp, new_blk->bp);
+		break;
+	}
+
+	/*
+	 * Link blocks in appropriate order.
+	 */
+	if (before) {
+		/*
+		 * Link new block in before existing block.
+		 */
+		INT_SET(new_info->forw, ARCH_CONVERT, old_blk->blkno);
+		new_info->back = old_info->back; /* INT_: direct copy */
+		if (INT_GET(old_info->back, ARCH_CONVERT)) {
+			error = xfs_da_read_buf(args->trans, args->dp,
+						INT_GET(old_info->back,
+							ARCH_CONVERT), -1, &bp,
+						args->whichfork);
+			if (error)
+				return(error);
+			ASSERT(bp != NULL);
+			tmp_info = bp->data;
+			ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(old_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == old_blk->blkno);
+			INT_SET(tmp_info->forw, ARCH_CONVERT, new_blk->blkno);
+			xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+			xfs_da_buf_done(bp);
+		}
+		INT_SET(old_info->back, ARCH_CONVERT, new_blk->blkno);
+	} else {
+		/*
+		 * Link new block in after existing block.
+		 */
+		new_info->forw = old_info->forw; /* INT_: direct copy */
+		INT_SET(new_info->back, ARCH_CONVERT, old_blk->blkno);
+		if (INT_GET(old_info->forw, ARCH_CONVERT)) {
+			error = xfs_da_read_buf(args->trans, args->dp,
+						INT_GET(old_info->forw, ARCH_CONVERT), -1, &bp,
+						args->whichfork);
+			if (error)
+				return(error);
+			ASSERT(bp != NULL);
+			tmp_info = bp->data;
+			ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
+				    == INT_GET(old_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
+				    == old_blk->blkno);
+			INT_SET(tmp_info->back, ARCH_CONVERT, new_blk->blkno);
+			xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
+			xfs_da_buf_done(bp);
+		}
+		INT_SET(old_info->forw, ARCH_CONVERT, new_blk->blkno);
+	}
+
+	xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
+	xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
+	return(0);
+}
+
+/*
+ * Compare two intermediate nodes for "order".
+ */
+STATIC int
+xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp)
+{
+	xfs_da_intnode_t *node1, *node2;
+
+	node1 = node1_bp->data;
+	node2 = node2_bp->data;
+	ASSERT((INT_GET(node1->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) &&
+	       (INT_GET(node2->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC));
+	if ((INT_GET(node1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(node2->hdr.count, ARCH_CONVERT) > 0) &&
+	    ((INT_GET(node2->btree[ 0 ].hashval, ARCH_CONVERT) <
+	      INT_GET(node1->btree[ 0 ].hashval, ARCH_CONVERT)) ||
+	     (INT_GET(node2->btree[ INT_GET(node2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
+	      INT_GET(node1->btree[ INT_GET(node1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
+		return(1);
+	}
+	return(0);
+}
+
+/*
+ * Pick up the last hashvalue from an intermediate node.
+ */
+STATIC uint
+xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count)
+{
+	xfs_da_intnode_t *node;
+
+	node = bp->data;
+	ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+	if (count)
+		*count = INT_GET(node->hdr.count, ARCH_CONVERT);
+	if (INT_ISZERO(node->hdr.count, ARCH_CONVERT))
+		return(0);
+	return(INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Unlink a block from a doubly linked list of blocks.
+ */
+int							/* error */
+xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+				 xfs_da_state_blk_t *save_blk)
+{
+	xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info;
+	xfs_da_args_t *args;
+	xfs_dabuf_t *bp;
+	int error;
+
+	/*
+	 * Set up environment.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	save_info = save_blk->bp->data;
+	drop_info = drop_blk->bp->data;
+	ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
+	       save_blk->magic == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+	       save_blk->magic == XFS_ATTR_LEAF_MAGIC);
+	ASSERT(save_blk->magic == INT_GET(save_info->magic, ARCH_CONVERT));
+	ASSERT(drop_blk->magic == INT_GET(drop_info->magic, ARCH_CONVERT));
+	ASSERT(save_blk->magic == drop_blk->magic);
+	ASSERT((INT_GET(save_info->forw, ARCH_CONVERT) == drop_blk->blkno) ||
+	       (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno));
+	ASSERT((INT_GET(drop_info->forw, ARCH_CONVERT) == save_blk->blkno) ||
+	       (INT_GET(drop_info->back, ARCH_CONVERT) == save_blk->blkno));
+
+	/*
+	 * Unlink the leaf block from the doubly linked chain of leaves.
+	 */
+	if (INT_GET(save_info->back, ARCH_CONVERT) == drop_blk->blkno) {
+		save_info->back = drop_info->back; /* INT_: direct copy */
+		if (INT_GET(drop_info->back, ARCH_CONVERT)) {
+			error = xfs_da_read_buf(args->trans, args->dp,
+						INT_GET(drop_info->back,
+							ARCH_CONVERT), -1, &bp,
+						args->whichfork);
+			if (error)
+				return(error);
+			ASSERT(bp != NULL);
+			tmp_info = bp->data;
+			ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT) == INT_GET(save_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(tmp_info->forw, ARCH_CONVERT) == drop_blk->blkno);
+			INT_SET(tmp_info->forw, ARCH_CONVERT, save_blk->blkno);
+			xfs_da_log_buf(args->trans, bp, 0,
+						    sizeof(*tmp_info) - 1);
+			xfs_da_buf_done(bp);
+		}
+	} else {
+		save_info->forw = drop_info->forw; /* INT_: direct copy */
+		if (INT_GET(drop_info->forw, ARCH_CONVERT)) {
+			error = xfs_da_read_buf(args->trans, args->dp,
+						INT_GET(drop_info->forw, ARCH_CONVERT), -1, &bp,
+						args->whichfork);
+			if (error)
+				return(error);
+			ASSERT(bp != NULL);
+			tmp_info = bp->data;
+			ASSERT(INT_GET(tmp_info->magic, ARCH_CONVERT)
+				    == INT_GET(save_info->magic, ARCH_CONVERT));
+			ASSERT(INT_GET(tmp_info->back, ARCH_CONVERT)
+				    == drop_blk->blkno);
+			INT_SET(tmp_info->back, ARCH_CONVERT, save_blk->blkno);
+			xfs_da_log_buf(args->trans, bp, 0,
+						    sizeof(*tmp_info) - 1);
+			xfs_da_buf_done(bp);
+		}
+	}
+
+	xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
+	return(0);
+}
+
+/*
+ * Move a path "forward" or "!forward" one block at the current level.
+ *
+ * This routine will adjust a "path" to point to the next block
+ * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
+ * Btree, including updating pointers to the intermediate nodes between
+ * the new bottom and the root.
+ */
+int							/* error */
+xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+				 int forward, int release, int *result)
+{
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	xfs_da_intnode_t *node;
+	xfs_da_args_t *args;
+	xfs_dablk_t blkno=0;
+	int level, error;
+
+	/*
+	 * Roll up the Btree looking for the first block where our
+	 * current index is not at the edge of the block.  Note that
+	 * we skip the bottom layer because we want the sibling block.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(path != NULL);
+	ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+	level = (path->active-1) - 1;	/* skip bottom layer in path */
+	for (blk = &path->blk[level]; level >= 0; blk--, level--) {
+		ASSERT(blk->bp != NULL);
+		node = blk->bp->data;
+		ASSERT(INT_GET(node->hdr.info.magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		if (forward && (blk->index < INT_GET(node->hdr.count, ARCH_CONVERT)-1)) {
+			blk->index++;
+			blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+			break;
+		} else if (!forward && (blk->index > 0)) {
+			blk->index--;
+			blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+			break;
+		}
+	}
+	if (level < 0) {
+		*result = XFS_ERROR(ENOENT);	/* we're out of our tree */
+		ASSERT(args->oknoent);
+		return(0);
+	}
+
+	/*
+	 * Roll down the edge of the subtree until we reach the
+	 * same depth we were at originally.
+	 */
+	for (blk++, level++; level < path->active; blk++, level++) {
+		/*
+		 * Release the old block.
+		 * (if it's dirty, trans won't actually let go)
+		 */
+		if (release)
+			xfs_da_brelse(args->trans, blk->bp);
+
+		/*
+		 * Read the next child block.
+		 */
+		blk->blkno = blkno;
+		error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
+						     &blk->bp, args->whichfork);
+		if (error)
+			return(error);
+		ASSERT(blk->bp != NULL);
+		info = blk->bp->data;
+		ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC ||
+		       INT_GET(info->magic, ARCH_CONVERT) == XFS_DIRX_LEAF_MAGIC(state->mp) ||
+		       INT_GET(info->magic, ARCH_CONVERT) == XFS_ATTR_LEAF_MAGIC);
+		blk->magic = INT_GET(info->magic, ARCH_CONVERT);
+		if (INT_GET(info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC) {
+			node = (xfs_da_intnode_t *)info;
+			blk->hashval = INT_GET(node->btree[ INT_GET(node->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+			if (forward)
+				blk->index = 0;
+			else
+				blk->index = INT_GET(node->hdr.count, ARCH_CONVERT)-1;
+			blkno = INT_GET(node->btree[ blk->index ].before, ARCH_CONVERT);
+		} else {
+			ASSERT(level == path->active-1);
+			blk->index = 0;
+			switch(blk->magic) {
+#ifdef __KERNEL__
+			case XFS_ATTR_LEAF_MAGIC:
+				blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
+								      NULL);
+				break;
+#endif
+			case XFS_DIR_LEAF_MAGIC:
+				ASSERT(XFS_DIR_IS_V1(state->mp));
+				blk->hashval = xfs_dir_leaf_lasthash(blk->bp,
+								     NULL);
+				break;
+			case XFS_DIR2_LEAFN_MAGIC:
+				ASSERT(XFS_DIR_IS_V2(state->mp));
+				blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
+								       NULL);
+				break;
+			default:
+				ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC ||
+				       blk->magic ==
+				       XFS_DIRX_LEAF_MAGIC(state->mp));
+				break;
+			}
+		}
+	}
+	*result = 0;
+	return(0);
+}
+
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+xfs_dahash_t
+xfs_da_hashname(uchar_t *name, int namelen)
+{
+	xfs_dahash_t hash;
+
+#define ROTL(x,y)	(((x) << (y)) | ((x) >> (32 - (y))))
+#ifdef SLOWVERSION
+	/*
+	 * This is the old one-byte-at-a-time version.
+	 */
+	for (hash = 0; namelen > 0; namelen--) {
+		hash = *name++ ^ ROTL(hash, 7);
+	}
+	return(hash);
+#else
+	/*
+	 * Do four characters at a time as long as we can.
+	 */
+	for (hash = 0; namelen >= 4; namelen -= 4, name += 4) {
+		hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+		       (name[3] << 0) ^ ROTL(hash, 7 * 4);
+	}
+	/*
+	 * Now do the rest of the characters.
+	 */
+	switch (namelen) {
+	case 3:
+		return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+		       ROTL(hash, 7 * 3);
+	case 2:
+		return (name[0] << 7) ^ (name[1] << 0) ^ ROTL(hash, 7 * 2);
+	case 1:
+		return (name[0] << 0) ^ ROTL(hash, 7 * 1);
+	case 0:
+		return hash;
+	}
+	/* NOTREACHED */
+#endif
+#undef ROTL
+	return 0; /* keep gcc happy */
+}
+
+/*
+ * Add a block to the btree ahead of the file.
+ * Return the new block number to the caller.
+ */
+int
+xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
+{
+	xfs_fileoff_t bno, b;
+	xfs_bmbt_irec_t map;
+	xfs_bmbt_irec_t *mapp;
+	xfs_inode_t *dp;
+	int nmap, error, w, count, c, got, i, mapi;
+	xfs_fsize_t size;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	w = args->whichfork;
+	tp = args->trans;
+	/*
+	 * For new directories adjust the file offset and block count.
+	 */
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp)) {
+		bno = mp->m_dirleafblk;
+		count = mp->m_dirblkfsbs;
+	} else {
+		bno = 0;
+		count = 1;
+	}
+	/*
+	 * Find a spot in the file space to put the new block.
+	 */
+	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) {
+		return error;
+	}
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk);
+	/*
+	 * Try mapping it in one filesystem block.
+	 */
+	nmap = 1;
+	ASSERT(args->firstblock != NULL);
+	if ((error = xfs_bmapi(tp, dp, bno, count,
+			XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
+			XFS_BMAPI_CONTIG,
+			args->firstblock, args->total, &map, &nmap,
+			args->flist))) {
+		return error;
+	}
+	ASSERT(nmap <= 1);
+	if (nmap == 1) {
+		mapp = &map;
+		mapi = 1;
+	}
+	/*
+	 * If we didn't get it and the block might work if fragmented,
+	 * try without the CONTIG flag.	 Loop until we get it all.
+	 */
+	else if (nmap == 0 && count > 1) {
+		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+		for (b = bno, mapi = 0; b < bno + count; ) {
+			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+			c = (int)(bno + count - b);
+			if ((error = xfs_bmapi(tp, dp, b, c,
+					XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
+					XFS_BMAPI_METADATA,
+					args->firstblock, args->total,
+					&mapp[mapi], &nmap, args->flist))) {
+				kmem_free(mapp, sizeof(*mapp) * count);
+				return error;
+			}
+			if (nmap < 1)
+				break;
+			mapi += nmap;
+			b = mapp[mapi - 1].br_startoff +
+			    mapp[mapi - 1].br_blockcount;
+		}
+	} else {
+		mapi = 0;
+		mapp = NULL;
+	}
+	/*
+	 * Count the blocks we got, make sure it matches the total.
+	 */
+	for (i = 0, got = 0; i < mapi; i++)
+		got += mapp[i].br_blockcount;
+	if (got != count || mapp[0].br_startoff != bno ||
+	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+	    bno + count) {
+		if (mapp != &map)
+			kmem_free(mapp, sizeof(*mapp) * count);
+		return XFS_ERROR(ENOSPC);
+	}
+	if (mapp != &map)
+		kmem_free(mapp, sizeof(*mapp) * count);
+	*new_blkno = (xfs_dablk_t)bno;
+	/*
+	 * For version 1 directories, adjust the file size if it changed.
+	 */
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
+		ASSERT(mapi == 1);
+		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
+			return error;
+		size = XFS_FSB_TO_B(mp, bno);
+		if (size != dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Ick.	 We need to always be able to remove a btree block, even
+ * if there's no space reservation because the filesystem is full.
+ * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
+ * It swaps the target block with the last block in the file.  The
+ * last block in the file can always be removed since it can't cause
+ * a bmap btree split to do that.
+ */
+STATIC int
+xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop,
+		      xfs_dabuf_t **dead_bufp)
+{
+	xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno;
+	xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf;
+	xfs_fileoff_t lastoff;
+	xfs_inode_t *ip;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+	int error, w, entno, level, dead_level;
+	xfs_da_blkinfo_t *dead_info, *sib_info;
+	xfs_da_intnode_t *par_node, *dead_node;
+	xfs_dir_leafblock_t *dead_leaf;
+	xfs_dir2_leaf_t *dead_leaf2;
+	xfs_dahash_t dead_hash;
+
+	dead_buf = *dead_bufp;
+	dead_blkno = *dead_blknop;
+	tp = args->trans;
+	ip = args->dp;
+	w = args->whichfork;
+	ASSERT(w == XFS_DATA_FORK);
+	mp = ip->i_mount;
+	if (XFS_DIR_IS_V2(mp)) {
+		lastoff = mp->m_dirfreeblk;
+		error = xfs_bmap_last_before(tp, ip, &lastoff, w);
+	} else
+		error = xfs_bmap_last_offset(tp, ip, &lastoff, w);
+	if (error)
+		return error;
+	if (lastoff == 0)
+		return XFS_ERROR(EFSCORRUPTED);
+	/*
+	 * Read the last block in the btree space.
+	 */
+	last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
+	if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
+		return error;
+	/*
+	 * Copy the last block into the dead buffer and log it.
+	 */
+	bcopy(last_buf->data, dead_buf->data, mp->m_dirblksize);
+	xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
+	dead_info = dead_buf->data;
+	/*
+	 * Get values from the moved block.
+	 */
+	if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) {
+		ASSERT(XFS_DIR_IS_V1(mp));
+		dead_leaf = (xfs_dir_leafblock_t *)dead_info;
+		dead_level = 0;
+		dead_hash =
+			INT_GET(dead_leaf->entries[INT_GET(dead_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	} else if (INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC) {
+		ASSERT(XFS_DIR_IS_V2(mp));
+		dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
+		dead_level = 0;
+		dead_hash = INT_GET(dead_leaf2->ents[INT_GET(dead_leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	} else {
+		ASSERT(INT_GET(dead_info->magic, ARCH_CONVERT) == XFS_DA_NODE_MAGIC);
+		dead_node = (xfs_da_intnode_t *)dead_info;
+		dead_level = INT_GET(dead_node->hdr.level, ARCH_CONVERT);
+		dead_hash = INT_GET(dead_node->btree[INT_GET(dead_node->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	}
+	sib_buf = par_buf = NULL;
+	/*
+	 * If the moved block has a left sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = INT_GET(dead_info->back, ARCH_CONVERT))) {
+		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+			goto done;
+		sib_info = sib_buf->data;
+		if (INT_GET(sib_info->forw, ARCH_CONVERT) != last_blkno ||
+		    INT_GET(sib_info->magic, ARCH_CONVERT) != INT_GET(dead_info->magic, ARCH_CONVERT)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		INT_SET(sib_info->forw, ARCH_CONVERT, dead_blkno);
+		xfs_da_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
+					sizeof(sib_info->forw)));
+		xfs_da_buf_done(sib_buf);
+		sib_buf = NULL;
+	}
+	/*
+	 * If the moved block has a right sibling, fix up the pointers.
+	 */
+	if ((sib_blkno = INT_GET(dead_info->forw, ARCH_CONVERT))) {
+		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+			goto done;
+		sib_info = sib_buf->data;
+		if (   INT_GET(sib_info->back, ARCH_CONVERT) != last_blkno
+		    || INT_GET(sib_info->magic, ARCH_CONVERT)
+				!= INT_GET(dead_info->magic, ARCH_CONVERT)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		INT_SET(sib_info->back, ARCH_CONVERT, dead_blkno);
+		xfs_da_log_buf(tp, sib_buf,
+			XFS_DA_LOGRANGE(sib_info, &sib_info->back,
+					sizeof(sib_info->back)));
+		xfs_da_buf_done(sib_buf);
+		sib_buf = NULL;
+	}
+	par_blkno = XFS_DIR_IS_V1(mp) ? 0 : mp->m_dirleafblk;
+	level = -1;
+	/*
+	 * Walk down the tree looking for the parent of the moved block.
+	 */
+	for (;;) {
+		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+			goto done;
+		par_node = par_buf->data;
+		if (INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC ||
+		    (level >= 0 && level != INT_GET(par_node->hdr.level, ARCH_CONVERT) + 1)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		level = INT_GET(par_node->hdr.level, ARCH_CONVERT);
+		for (entno = 0;
+		     entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
+		     INT_GET(par_node->btree[entno].hashval, ARCH_CONVERT) < dead_hash;
+		     entno++)
+			continue;
+		if (entno == INT_GET(par_node->hdr.count, ARCH_CONVERT)) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		par_blkno = INT_GET(par_node->btree[entno].before, ARCH_CONVERT);
+		if (level == dead_level + 1)
+			break;
+		xfs_da_brelse(tp, par_buf);
+		par_buf = NULL;
+	}
+	/*
+	 * We're in the right parent block.
+	 * Look for the right entry.
+	 */
+	for (;;) {
+		for (;
+		     entno < INT_GET(par_node->hdr.count, ARCH_CONVERT) &&
+		     INT_GET(par_node->btree[entno].before, ARCH_CONVERT) != last_blkno;
+		     entno++)
+			continue;
+		if (entno < INT_GET(par_node->hdr.count, ARCH_CONVERT))
+			break;
+		par_blkno = INT_GET(par_node->hdr.info.forw, ARCH_CONVERT);
+		xfs_da_brelse(tp, par_buf);
+		par_buf = NULL;
+		if (par_blkno == 0) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+			goto done;
+		par_node = par_buf->data;
+		if (INT_GET(par_node->hdr.level, ARCH_CONVERT) != level ||
+		    INT_GET(par_node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC) {
+			error = XFS_ERROR(EFSCORRUPTED);
+			goto done;
+		}
+		entno = 0;
+	}
+	/*
+	 * Update the parent entry pointing to the moved block.
+	 */
+	INT_SET(par_node->btree[entno].before, ARCH_CONVERT, dead_blkno);
+	xfs_da_log_buf(tp, par_buf,
+		XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before,
+				sizeof(par_node->btree[entno].before)));
+	xfs_da_buf_done(par_buf);
+	xfs_da_buf_done(dead_buf);
+	*dead_blknop = last_blkno;
+	*dead_bufp = last_buf;
+	return 0;
+done:
+	if (par_buf)
+		xfs_da_brelse(tp, par_buf);
+	if (sib_buf)
+		xfs_da_brelse(tp, sib_buf);
+	xfs_da_brelse(tp, last_buf);
+	return error;
+}
+
+/*
+ * Remove a btree block from a directory or attribute.
+ */
+int
+xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+		    xfs_dabuf_t *dead_buf)
+{
+	xfs_inode_t *dp;
+	int done, error, w, count;
+	xfs_fileoff_t bno;
+	xfs_fsize_t size;
+	xfs_trans_t *tp;
+	xfs_mount_t *mp;
+
+	dp = args->dp;
+	w = args->whichfork;
+	tp = args->trans;
+	mp = dp->i_mount;
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		count = mp->m_dirblkfsbs;
+	else
+		count = 1;
+	for (;;) {
+		/*
+		 * Remove extents.  If we get ENOSPC for a dir we have to move
+		 * the last block to the place we want to kill.
+		 */
+		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
+				XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
+				0, args->firstblock, args->flist,
+				&done)) == ENOSPC) {
+			if (w != XFS_DATA_FORK)
+				goto done;
+			if ((error = xfs_da_swap_lastblock(args, &dead_blkno,
+					&dead_buf)))
+				goto done;
+		} else if (error)
+			goto done;
+		else
+			break;
+	}
+	ASSERT(done);
+	xfs_da_binval(tp, dead_buf);
+	/*
+	 * Adjust the directory size for version 1.
+	 */
+	if (w == XFS_DATA_FORK && XFS_DIR_IS_V1(mp)) {
+		if ((error = xfs_bmap_last_offset(tp, dp, &bno, w)))
+			return error;
+		size = XFS_FSB_TO_B(dp->i_mount, bno);
+		if (size != dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+done:
+	xfs_da_binval(tp, dead_buf);
+	return error;
+}
+
+/*
+ * See if the mapping(s) for this btree block are valid, i.e.
+ * don't contain holes, are logically contiguous, and cover the whole range.
+ */
+STATIC int
+xfs_da_map_covers_blocks(
+	int		nmap,
+	xfs_bmbt_irec_t *mapp,
+	xfs_dablk_t	bno,
+	int		count)
+{
+	int		i;
+	xfs_fileoff_t	off;
+
+	for (i = 0, off = bno; i < nmap; i++) {
+		if (mapp[i].br_startblock == HOLESTARTBLOCK ||
+		    mapp[i].br_startblock == DELAYSTARTBLOCK) {
+			return 0;
+		}
+		if (off != mapp[i].br_startoff) {
+			return 0;
+		}
+		off += mapp[i].br_blockcount;
+	}
+	return off == bno + count;
+}
+
+/*
+ * Make a dabuf.
+ * Used for get_buf, read_buf, read_bufr, and reada_buf.
+ */
+STATIC int
+xfs_da_do_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	xfs_daddr_t	*mappedbnop,
+	xfs_dabuf_t	**bpp,
+	int		whichfork,
+	int		caller,
+	inst_t		*ra)
+{
+	xfs_buf_t	*bp = 0;
+	xfs_buf_t	**bplist;
+	int		error=0;
+	int		i;
+	xfs_bmbt_irec_t map;
+	xfs_bmbt_irec_t *mapp;
+	xfs_daddr_t	mappedbno;
+	xfs_mount_t	*mp;
+	int		nbplist=0;
+	int		nfsb;
+	int		nmap;
+	xfs_dabuf_t	*rbp;
+
+	mp = dp->i_mount;
+	if (whichfork == XFS_DATA_FORK && XFS_DIR_IS_V2(mp))
+		nfsb = mp->m_dirblkfsbs;
+	else
+		nfsb = 1;
+	mappedbno = *mappedbnop;
+	/*
+	 * Caller doesn't have a mapping.  -2 means don't complain
+	 * if we land in a hole.
+	 */
+	if (mappedbno == -1 || mappedbno == -2) {
+		/*
+		 * Optimize the one-block case.
+		 */
+		if (nfsb == 1) {
+			xfs_fsblock_t	fsb;
+
+			if ((error =
+			    xfs_bmapi_single(trans, dp, whichfork, &fsb,
+				    (xfs_fileoff_t)bno))) {
+				return error;
+			}
+			mapp = &map;
+			if (fsb == NULLFSBLOCK) {
+				nmap = 0;
+			} else {
+				map.br_startblock = fsb;
+				map.br_startoff = (xfs_fileoff_t)bno;
+				map.br_blockcount = 1;
+				nmap = 1;
+			}
+		} else {
+			mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP);
+			nmap = nfsb;
+			if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
+					nfsb,
+					XFS_BMAPI_METADATA |
+						XFS_BMAPI_AFLAG(whichfork),
+					NULL, 0, mapp, &nmap, NULL)))
+				goto exit0;
+		}
+	} else {
+		map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
+		map.br_startoff = (xfs_fileoff_t)bno;
+		map.br_blockcount = nfsb;
+		mapp = &map;
+		nmap = 1;
+	}
+	if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) {
+		error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
+		goto exit0;
+	}
+	if (caller != 3 && nmap > 1) {
+		bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP);
+		nbplist = 0;
+	} else
+		bplist = NULL;
+	/*
+	 * Turn the mapping(s) into buffer(s).
+	 */
+	for (i = 0; i < nmap; i++) {
+		int	nmapped;
+
+		mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock);
+		if (i == 0)
+			*mappedbnop = mappedbno;
+		nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount);
+		switch (caller) {
+		case 0:
+			bp = xfs_trans_get_buf(trans, mp->m_ddev_targp,
+				mappedbno, nmapped, 0);
+			error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO);
+			break;
+		case 1:
+#ifndef __KERNEL__
+		case 2:
+#endif
+			bp = NULL;
+			error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp,
+				mappedbno, nmapped, 0, &bp);
+			break;
+#ifdef __KERNEL__
+		case 3:
+			xfs_baread(mp->m_ddev_targp, mappedbno, nmapped);
+			error = 0;
+			bp = NULL;
+			break;
+#endif
+		}
+		if (error) {
+			if (bp)
+				xfs_trans_brelse(trans, bp);
+			goto exit1;
+		}
+		if (!bp)
+			continue;
+		if (caller == 1) {
+			if (whichfork == XFS_ATTR_FORK) {
+				XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE,
+						XFS_ATTR_BTREE_REF);
+			} else {
+				XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE,
+						XFS_DIR_BTREE_REF);
+			}
+		}
+		if (bplist) {
+			bplist[nbplist++] = bp;
+		}
+	}
+	/*
+	 * Build a dabuf structure.
+	 */
+	if (bplist) {
+		rbp = xfs_da_buf_make(nbplist, bplist, ra);
+	} else if (bp)
+		rbp = xfs_da_buf_make(1, &bp, ra);
+	else
+		rbp = NULL;
+	/*
+	 * For read_buf, check the magic number.
+	 */
+	if (caller == 1) {
+		xfs_dir2_data_t		*data;
+		xfs_dir2_free_t		*free;
+		xfs_da_blkinfo_t	*info;
+		uint			magic, magic1;
+
+		info = rbp->data;
+		data = rbp->data;
+		free = rbp->data;
+		magic = INT_GET(info->magic, ARCH_CONVERT);
+		magic1 = INT_GET(data->hdr.magic, ARCH_CONVERT);
+		if (XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
+				   (magic != XFS_DIR_LEAF_MAGIC) &&
+				   (magic != XFS_ATTR_LEAF_MAGIC) &&
+				   (magic != XFS_DIR2_LEAF1_MAGIC) &&
+				   (magic != XFS_DIR2_LEAFN_MAGIC) &&
+				   (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
+				   (magic1 != XFS_DIR2_DATA_MAGIC) &&
+				   (INT_GET(free->hdr.magic, ARCH_CONVERT) != XFS_DIR2_FREE_MAGIC),
+				mp, XFS_ERRTAG_DA_READ_BUF,
+				XFS_RANDOM_DA_READ_BUF)) {
+			xfs_buftrace("DA READ ERROR", rbp->bps[0]);
+			error = XFS_ERROR(EFSCORRUPTED);
+			xfs_da_brelse(trans, rbp);
+			nbplist = 0;
+			goto exit1;
+		}
+	}
+	if (bplist) {
+		kmem_free(bplist, sizeof(*bplist) * nmap);
+	}
+	if (mapp != &map) {
+		kmem_free(mapp, sizeof(*mapp) * nfsb);
+	}
+	if (bpp)
+		*bpp = rbp;
+	return 0;
+exit1:
+	if (bplist) {
+		for (i = 0; i < nbplist; i++)
+			xfs_trans_brelse(trans, bplist[i]);
+		kmem_free(bplist, sizeof(*bplist) * nmap);
+	}
+exit0:
+	if (mapp != &map)
+		kmem_free(mapp, sizeof(*mapp) * nfsb);
+	if (bpp)
+		*bpp = NULL;
+	return error;
+}
+
+/*
+ * Get a buffer for the dir/attr block.
+ */
+int
+xfs_da_get_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	xfs_daddr_t		mappedbno,
+	xfs_dabuf_t	**bpp,
+	int		whichfork)
+{
+	return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0,
+						 (inst_t *)__return_address);
+}
+
+/*
+ * Get a buffer for the dir/attr block, fill in the contents.
+ */
+int
+xfs_da_read_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	xfs_daddr_t		mappedbno,
+	xfs_dabuf_t	**bpp,
+	int		whichfork)
+{
+	return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1,
+		(inst_t *)__return_address);
+}
+
+/*
+ * Readahead the dir/attr block.
+ */
+xfs_daddr_t
+xfs_da_reada_buf(
+	xfs_trans_t	*trans,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	int		whichfork)
+{
+	xfs_daddr_t		rval;
+
+	rval = -1;
+	if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3,
+			(inst_t *)__return_address))
+		return -1;
+	else
+		return rval;
+}
+
+/*
+ * Calculate the number of bits needed to hold i different values.
+ */
+uint
+xfs_da_log2_roundup(uint i)
+{
+	uint rval;
+
+	for (rval = 0; rval < NBBY * sizeof(i); rval++) {
+		if ((1 << rval) >= i)
+			break;
+	}
+	return(rval);
+}
+
+kmem_zone_t *xfs_da_state_zone;	/* anchor for state struct zone */
+kmem_zone_t *xfs_dabuf_zone;		/* dabuf zone */
+
+/*
+ * Allocate a dir-state structure.
+ * We don't put them on the stack since they're large.
+ */
+xfs_da_state_t *
+xfs_da_state_alloc(void)
+{
+	return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+}
+
+/*
+ * Kill the altpath contents of a da-state structure.
+ */
+void
+xfs_da_state_kill_altpath(xfs_da_state_t *state)
+{
+	int	i;
+
+	for (i = 0; i < state->altpath.active; i++) {
+		if (state->altpath.blk[i].bp) {
+			if (state->altpath.blk[i].bp != state->path.blk[i].bp)
+				xfs_da_buf_done(state->altpath.blk[i].bp);
+			state->altpath.blk[i].bp = NULL;
+		}
+	}
+	state->altpath.active = 0;
+}
+
+/*
+ * Free a da-state structure.
+ */
+void
+xfs_da_state_free(xfs_da_state_t *state)
+{
+	int	i;
+
+	xfs_da_state_kill_altpath(state);
+	for (i = 0; i < state->path.active; i++) {
+		if (state->path.blk[i].bp)
+			xfs_da_buf_done(state->path.blk[i].bp);
+	}
+	if (state->extravalid && state->extrablk.bp)
+		xfs_da_buf_done(state->extrablk.bp);
+#ifdef DEBUG
+	bzero((char *)state, sizeof(*state));
+#endif /* DEBUG */
+	kmem_zone_free(xfs_da_state_zone, state);
+}
+
+#ifdef XFS_DABUF_DEBUG
+xfs_dabuf_t	*xfs_dabuf_global_list;
+lock_t		xfs_dabuf_global_lock;
+#endif
+
+/*
+ * Create a dabuf.
+ */
+/* ARGSUSED */
+STATIC xfs_dabuf_t *
+xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
+{
+	xfs_buf_t	*bp;
+	xfs_dabuf_t	*dabuf;
+	int		i;
+	int		off;
+
+	if (nbuf == 1)
+		dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+	else
+		dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+	dabuf->dirty = 0;
+#ifdef XFS_DABUF_DEBUG
+	dabuf->ra = ra;
+	dabuf->dev = XFS_BUF_TARGET_DEV(bps[0]);
+	dabuf->blkno = XFS_BUF_ADDR(bps[0]);
+#endif
+	if (nbuf == 1) {
+		dabuf->nbuf = 1;
+		bp = bps[0];
+		dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp));
+		dabuf->data = XFS_BUF_PTR(bp);
+		dabuf->bps[0] = bp;
+	} else {
+		dabuf->nbuf = nbuf;
+		for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) {
+			dabuf->bps[i] = bp = bps[i];
+			dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp));
+		}
+		dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP);
+		for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+			bp = bps[i];
+			bcopy(XFS_BUF_PTR(bp), (char *)dabuf->data + off,
+				XFS_BUF_COUNT(bp));
+		}
+	}
+#ifdef XFS_DABUF_DEBUG
+	{
+		SPLDECL(s);
+		xfs_dabuf_t	*p;
+
+		s = mutex_spinlock(&xfs_dabuf_global_lock);
+		for (p = xfs_dabuf_global_list; p; p = p->next) {
+			ASSERT(p->blkno != dabuf->blkno ||
+			       p->dev != dabuf->dev);
+		}
+		dabuf->prev = NULL;
+		if (xfs_dabuf_global_list)
+			xfs_dabuf_global_list->prev = dabuf;
+		dabuf->next = xfs_dabuf_global_list;
+		xfs_dabuf_global_list = dabuf;
+		mutex_spinunlock(&xfs_dabuf_global_lock, s);
+	}
+#endif
+	return dabuf;
+}
+
+/*
+ * Un-dirty a dabuf.
+ */
+STATIC void
+xfs_da_buf_clean(xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*bp;
+	int		i;
+	int		off;
+
+	if (dabuf->dirty) {
+		ASSERT(dabuf->nbuf > 1);
+		dabuf->dirty = 0;
+		for (i = off = 0; i < dabuf->nbuf;
+				i++, off += XFS_BUF_COUNT(bp)) {
+			bp = dabuf->bps[i];
+			bcopy((char *)dabuf->data + off, XFS_BUF_PTR(bp),
+				XFS_BUF_COUNT(bp));
+		}
+	}
+}
+
+/*
+ * Release a dabuf.
+ */
+void
+xfs_da_buf_done(xfs_dabuf_t *dabuf)
+{
+	ASSERT(dabuf);
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	if (dabuf->dirty)
+		xfs_da_buf_clean(dabuf);
+	if (dabuf->nbuf > 1)
+		kmem_free(dabuf->data, BBTOB(dabuf->bbcount));
+#ifdef XFS_DABUF_DEBUG
+	{
+		SPLDECL(s);
+
+		s = mutex_spinlock(&xfs_dabuf_global_lock);
+		if (dabuf->prev)
+			dabuf->prev->next = dabuf->next;
+		else
+			xfs_dabuf_global_list = dabuf->next;
+		if (dabuf->next)
+			dabuf->next->prev = dabuf->prev;
+		mutex_spinunlock(&xfs_dabuf_global_lock, s);
+	}
+	bzero(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf));
+#endif
+	if (dabuf->nbuf == 1)
+		kmem_zone_free(xfs_dabuf_zone, dabuf);
+	else
+		kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf));
+}
+
+/*
+ * Log transaction from a dabuf.
+ */
+void
+xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last)
+{
+	xfs_buf_t	*bp;
+	uint		f;
+	int		i;
+	uint		l;
+	int		off;
+
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	if (dabuf->nbuf == 1) {
+		ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0]));
+		xfs_trans_log_buf(tp, dabuf->bps[0], first, last);
+		return;
+	}
+	dabuf->dirty = 1;
+	ASSERT(first <= last);
+	for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) {
+		bp = dabuf->bps[i];
+		f = off;
+		l = f + XFS_BUF_COUNT(bp) - 1;
+		if (f < first)
+			f = first;
+		if (l > last)
+			l = last;
+		if (f <= l)
+			xfs_trans_log_buf(tp, bp, f - off, l - off);
+		/*
+		 * B_DONE is set by xfs_trans_log buf.
+		 * If we don't set it on a new buffer (get not read)
+		 * then if we don't put anything in the buffer it won't
+		 * be set, and at commit it it released into the cache,
+		 * and then a read will fail.
+		 */
+		else if (!(XFS_BUF_ISDONE(bp)))
+		  XFS_BUF_DONE(bp);
+	}
+	ASSERT(last < off);
+}
+
+/*
+ * Release dabuf from a transaction.
+ * Have to free up the dabuf before the buffers are released,
+ * since the synchronization on the dabuf is really the lock on the buffer.
+ */
+void
+xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*bp;
+	xfs_buf_t	**bplist;
+	int		i;
+	int		nbuf;
+
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	if ((nbuf = dabuf->nbuf) == 1) {
+		bplist = &bp;
+		bp = dabuf->bps[0];
+	} else {
+		bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
+		bcopy(dabuf->bps, bplist, nbuf * sizeof(*bplist));
+	}
+	xfs_da_buf_done(dabuf);
+	for (i = 0; i < nbuf; i++)
+		xfs_trans_brelse(tp, bplist[i]);
+	if (bplist != &bp)
+		kmem_free(bplist, nbuf * sizeof(*bplist));
+}
+
+/*
+ * Invalidate dabuf from a transaction.
+ */
+void
+xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
+{
+	xfs_buf_t	*bp;
+	xfs_buf_t	**bplist;
+	int		i;
+	int		nbuf;
+
+	ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]);
+	if ((nbuf = dabuf->nbuf) == 1) {
+		bplist = &bp;
+		bp = dabuf->bps[0];
+	} else {
+		bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP);
+		bcopy(dabuf->bps, bplist, nbuf * sizeof(*bplist));
+	}
+	xfs_da_buf_done(dabuf);
+	for (i = 0; i < nbuf; i++)
+		xfs_trans_binval(tp, bplist[i]);
+	if (bplist != &bp)
+		kmem_free(bplist, nbuf * sizeof(*bplist));
+}
+
+/*
+ * Get the first daddr from a dabuf.
+ */
+xfs_daddr_t
+xfs_da_blkno(xfs_dabuf_t *dabuf)
+{
+	ASSERT(dabuf->nbuf);
+	ASSERT(dabuf->data);
+	return XFS_BUF_ADDR(dabuf->bps[0]);
+}
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
new file mode 100644
index 000000000000..88bfbf9e1dc4
--- /dev/null
+++ b/fs/xfs/xfs_da_btree.h
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DA_BTREE_H__
+#define __XFS_DA_BTREE_H__
+
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+struct zone;
+
+/*========================================================================
+ * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * Is is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC	0xfebe	/* magic number: non-leaf blocks */
+#define XFS_DIR_LEAF_MAGIC	0xfeeb	/* magic number: directory leaf blks */
+#define XFS_ATTR_LEAF_MAGIC	0xfbee	/* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC	0xd2f1	/* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC	0xd2ff	/* magic number: v2 dirlf multi blks */
+
+#define XFS_DIRX_LEAF_MAGIC(mp) \
+	(XFS_DIR_IS_V1(mp) ? XFS_DIR_LEAF_MAGIC : XFS_DIR2_LEAFN_MAGIC)
+
+typedef struct xfs_da_blkinfo {
+	xfs_dablk_t forw;			/* previous block in list */
+	xfs_dablk_t back;			/* following block in list */
+	__uint16_t magic;			/* validity check on block */
+	__uint16_t pad;				/* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all match in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH	5	/* max depth of Btree */
+
+typedef struct xfs_da_intnode {
+	struct xfs_da_node_hdr {	/* constant-structure header block */
+		xfs_da_blkinfo_t info;	/* block type, links, etc. */
+		__uint16_t count;	/* count of active entries */
+		__uint16_t level;	/* level above leaves (leaf == 0) */
+	} hdr;
+	struct xfs_da_node_entry {
+		xfs_dahash_t hashval;	/* hash value for this descendant */
+		xfs_dablk_t before;	/* Btree block before this key */
+	} btree[1];			/* variable sized array of keys */
+} xfs_da_intnode_t;
+typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
+typedef struct xfs_da_node_entry xfs_da_node_entry_t;
+
+#define XFS_DA_NODE_ENTSIZE_BYNAME	/* space a name uses */ \
+	(sizeof(xfs_da_node_entry_t))
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_NODE_ENTRIES)
+int xfs_da_node_entries(struct xfs_mount *mp);
+#define XFS_DA_NODE_ENTRIES(mp)		xfs_da_node_entries(mp)
+#else
+#define XFS_DA_NODE_ENTRIES(mp)		((mp)->m_da_node_ents)
+#endif
+
+#define XFS_DA_MAXHASH	((xfs_dahash_t)-1) /* largest valid hash value */
+
+/*
+ * Macros used by directory code to interface to the filesystem.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBSIZE)
+int xfs_lbsize(struct xfs_mount *mp);
+#define XFS_LBSIZE(mp)			xfs_lbsize(mp)
+#else
+#define XFS_LBSIZE(mp)	((mp)->m_sb.sb_blocksize)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LBLOG)
+int xfs_lblog(struct xfs_mount *mp);
+#define XFS_LBLOG(mp)			xfs_lblog(mp)
+#else
+#define XFS_LBLOG(mp)	((mp)->m_sb.sb_blocklog)
+#endif
+
+/*
+ * Macros used by directory code to interface to the kernel
+ */
+
+/*
+ * Macros used to manipulate directory off_t's
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_BNOENTRY)
+__uint32_t xfs_da_make_bnoentry(struct xfs_mount *mp, xfs_dablk_t bno,
+				int entry);
+#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry)	\
+	xfs_da_make_bnoentry(mp,bno,entry)
+#else
+#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
+	(((bno) << (mp)->m_dircook_elog) | (entry))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_MAKE_COOKIE)
+xfs_off_t xfs_da_make_cookie(struct xfs_mount *mp, xfs_dablk_t bno, int entry,
+				xfs_dahash_t hash);
+#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash)	\
+	xfs_da_make_cookie(mp,bno,entry,hash)
+#else
+#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
+	(((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_HASH)
+xfs_dahash_t xfs_da_cookie_hash(struct xfs_mount *mp, xfs_off_t cookie);
+#define XFS_DA_COOKIE_HASH(mp,cookie)		xfs_da_cookie_hash(mp,cookie)
+#else
+#define XFS_DA_COOKIE_HASH(mp,cookie)	((xfs_dahash_t)(cookie))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_BNO)
+xfs_dablk_t xfs_da_cookie_bno(struct xfs_mount *mp, xfs_off_t cookie);
+#define XFS_DA_COOKIE_BNO(mp,cookie)		xfs_da_cookie_bno(mp,cookie)
+#else
+#define XFS_DA_COOKIE_BNO(mp,cookie) \
+	(((xfs_off_t)(cookie) >> 31) == -1LL ? \
+		(xfs_dablk_t)0 : \
+		(xfs_dablk_t)((xfs_off_t)(cookie) >> ((mp)->m_dircook_elog + 32)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DA_COOKIE_ENTRY)
+int xfs_da_cookie_entry(struct xfs_mount *mp, xfs_off_t cookie);
+#define XFS_DA_COOKIE_ENTRY(mp,cookie)		xfs_da_cookie_entry(mp,cookie)
+#else
+#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
+	(((xfs_off_t)(cookie) >> 31) == -1LL ? \
+		(xfs_dablk_t)0 : \
+		(xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
+			      ((1 << (mp)->m_dircook_elog) - 1)))
+#endif
+
+
+/*========================================================================
+ * Btree searching and modification structure definitions.
+ *========================================================================*/
+
+/*
+ * Structure to ease passing around component names.
+ */
+typedef struct xfs_da_args {
+	uchar_t		*name;		/* string (maybe not NULL terminated) */
+	int		namelen;	/* length of string (maybe no NULL) */
+	uchar_t		*value;		/* set of bytes (maybe contain NULLs) */
+	int		valuelen;	/* length of value */
+	int		flags;		/* argument flags (eg: ATTR_NOCREATE) */
+	xfs_dahash_t	hashval;	/* hash value of name */
+	xfs_ino_t	inumber;	/* input/output inode number */
+	struct xfs_inode *dp;		/* directory inode to manipulate */
+	xfs_fsblock_t	*firstblock;	/* ptr to firstblock for bmap calls */
+	struct xfs_bmap_free *flist;	/* ptr to freelist for bmap_finish */
+	struct xfs_trans *trans;	/* current trans (changes over time) */
+	xfs_extlen_t	total;		/* total blocks needed, for 1st bmap */
+	int		whichfork;	/* data or attribute fork */
+	xfs_dablk_t	blkno;		/* blkno of attr leaf of interest */
+	int		index;		/* index of attr of interest in blk */
+	xfs_dablk_t	rmtblkno;	/* remote attr value starting blkno */
+	int		rmtblkcnt;	/* remote attr value block count */
+	int		rename;		/* T/F: this is an atomic rename op */
+	xfs_dablk_t	blkno2;		/* blkno of 2nd attr leaf of interest */
+	int		index2;		/* index of 2nd attr in blk */
+	xfs_dablk_t	rmtblkno2;	/* remote attr value starting blkno */
+	int		rmtblkcnt2;	/* remote attr value block count */
+	int		justcheck;	/* check for ok with no space */
+	int		addname;	/* T/F: this is an add operation */
+	int		oknoent;	/* T/F: ok to return ENOENT, else die */
+} xfs_da_args_t;
+
+/*
+ * Structure to describe buffer(s) for a block.
+ * This is needed in the directory version 2 format case, when
+ * multiple non-contiguous fsblocks might be needed to cover one
+ * logical directory block.
+ * If the buffer count is 1 then the data pointer points to the
+ * same place as the b_addr field for the buffer, else to kmem_alloced memory.
+ */
+typedef struct xfs_dabuf {
+	int		nbuf;		/* number of buffer pointers present */
+	short		dirty;		/* data needs to be copied back */
+	short		bbcount;	/* how large is data in bbs */
+	void		*data;		/* pointer for buffers' data */
+#ifdef XFS_DABUF_DEBUG
+	inst_t		*ra;		/* return address of caller to make */
+	struct xfs_dabuf *next;		/* next in global chain */
+	struct xfs_dabuf *prev;		/* previous in global chain */
+	dev_t		dev;		/* device for buffer */
+	xfs_daddr_t	blkno;		/* daddr first in bps[0] */
+#endif
+	struct xfs_buf	*bps[1];	/* actually nbuf of these */
+} xfs_dabuf_t;
+#define XFS_DA_BUF_SIZE(n)	\
+	(sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1))
+
+#ifdef XFS_DABUF_DEBUG
+extern xfs_dabuf_t	*xfs_dabuf_global_list;
+#endif
+
+/*
+ * Storage for holding state during Btree searches and split/join ops.
+ *
+ * Only need space for 5 intermediate nodes.  With a minimum of 62-way
+ * fanout to the Btree, we can support over 900 million directory blocks,
+ * which is slightly more than enough.
+ */
+typedef struct xfs_da_state_blk {
+	xfs_dabuf_t	*bp;		/* buffer containing block */
+	xfs_dablk_t	blkno;		/* filesystem blkno of buffer */
+	xfs_daddr_t		disk_blkno;	/* on-disk blkno (in BBs) of buffer */
+	int		index;		/* relevant index into block */
+	xfs_dahash_t	hashval;	/* last hash value in block */
+	int		magic;		/* blk's magic number, ie: blk type */
+} xfs_da_state_blk_t;
+
+typedef struct xfs_da_state_path {
+	int			active;		/* number of active levels */
+	xfs_da_state_blk_t	blk[XFS_DA_NODE_MAXDEPTH];
+} xfs_da_state_path_t;
+
+typedef struct xfs_da_state {
+	xfs_da_args_t		*args;		/* filename arguments */
+	struct xfs_mount	*mp;		/* filesystem mount point */
+	int			blocksize;	/* logical block size */
+	int			inleaf;		/* insert into 1->lf, 0->splf */
+	xfs_da_state_path_t	path;		/* search/split paths */
+	xfs_da_state_path_t	altpath;	/* alternate path for join */
+	int			extravalid;	/* T/F: extrablk is in use */
+	int			extraafter;	/* T/F: extrablk is after new */
+	xfs_da_state_blk_t	extrablk;	/* for double-splits on leafs */
+						/* for dirv2 extrablk is data */
+} xfs_da_state_t;
+
+/*
+ * Utility macros to aid in logging changed structure fields.
+ */
+#define XFS_DA_LOGOFF(BASE, ADDR)	((char *)(ADDR) - (char *)(BASE))
+#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE)	\
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
+		(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
+
+
+#ifdef __KERNEL__
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
+					 xfs_dabuf_t **bpp, int whichfork);
+int	xfs_da_split(xfs_da_state_t *state);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_da_join(xfs_da_state_t *state);
+void	xfs_da_fixhashpath(xfs_da_state_t *state,
+					  xfs_da_state_path_t *path_to_to_fix);
+
+/*
+ * Routines used for finding things in the Btree.
+ */
+int	xfs_da_node_lookup_int(xfs_da_state_t *state, int *result);
+int	xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
+					 int forward, int release, int *result);
+/*
+ * Utility routines.
+ */
+int	xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+					 xfs_da_state_blk_t *save_blk);
+int	xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
+				       xfs_da_state_blk_t *new_blk);
+
+/*
+ * Utility routines.
+ */
+int	xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
+int	xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			      xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			      xfs_dabuf_t **bp, int whichfork);
+int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			       xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			       xfs_dabuf_t **bpp, int whichfork);
+xfs_daddr_t	xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
+			xfs_dablk_t bno, int whichfork);
+int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+					  xfs_dabuf_t *dead_buf);
+
+uint xfs_da_hashname(uchar_t *name_string, int name_length);
+uint xfs_da_log2_roundup(uint i);
+xfs_da_state_t *xfs_da_state_alloc(void);
+void xfs_da_state_free(xfs_da_state_t *state);
+void xfs_da_state_kill_altpath(xfs_da_state_t *state);
+
+void xfs_da_buf_done(xfs_dabuf_t *dabuf);
+void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first,
+			   uint last);
+void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
+void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf);
+xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
+
+extern struct kmem_zone *xfs_da_state_zone;
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
new file mode 100644
index 000000000000..6978817c63e7
--- /dev/null
+++ b/fs/xfs/xfs_dfrag.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_dfrag.h>
+
+/*
+ * Syssgi interface for swapext
+ */
+int
+xfs_swapext(
+	xfs_swapext_t	*sxp)
+{
+	xfs_swapext_t	sx;
+	xfs_inode_t	*ip=NULL, *tip=NULL, *ips[2];
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	xfs_bstat_t	*sbp;
+	struct file	*fp = NULL, *tfp = NULL;
+	vnode_t		*vp, *tvp;
+	bhv_desc_t	*bdp, *tbdp;
+	vn_bhv_head_t	*bhp, *tbhp;
+	uint		lock_flags=0;
+	int		ilf_fields, tilf_fields;
+	int		error = 0;
+	xfs_ifork_t	tempif, *ifp, *tifp;
+	__uint64_t	tmp;
+	int		aforkblks = 0;
+	int		taforkblks = 0;
+	int		locked = 0;
+
+	if (copy_from_user(&sx, sxp, sizeof(sx)))
+		return XFS_ERROR(EFAULT);
+
+	/* Pull information for the target fd */
+	if (((fp = fget((int)sx.sx_fdtarget)) == NULL) ||
+	    ((vp = LINVFS_GET_VP(fp->f_dentry->d_inode)) == NULL))  {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	bhp = VN_BHV_HEAD(vp);
+	VN_BHV_READ_LOCK(bhp);
+	bdp = vn_bhv_lookup(bhp, &xfs_vnodeops);
+	if (bdp == NULL) {
+		VN_BHV_READ_UNLOCK(bhp);
+		error = XFS_ERROR(EBADF);
+		goto error0;
+	} else {
+		ip = XFS_BHVTOI(bdp);
+		VN_BHV_READ_UNLOCK(bhp);
+	}
+
+	if (((tfp = fget((int)sx.sx_fdtmp)) == NULL) ||
+	    ((tvp = LINVFS_GET_VP(tfp->f_dentry->d_inode)) == NULL)) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	tbhp = VN_BHV_HEAD(tvp);
+	VN_BHV_READ_LOCK(tbhp);
+	tbdp = vn_bhv_lookup(tbhp, &xfs_vnodeops);
+	if (tbdp == NULL) {
+		VN_BHV_READ_UNLOCK(tbhp);
+		error = XFS_ERROR(EBADF);
+		goto error0;
+	} else {
+		tip = XFS_BHVTOI(tbdp);
+		VN_BHV_READ_UNLOCK(tbhp);
+	}
+
+	if (ip->i_ino == tip->i_ino) {
+		error =	 XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	mp = ip->i_mount;
+
+	sbp = &sx.sx_stat;
+
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		error =	 XFS_ERROR(EIO);
+		goto error0;
+	}
+
+	locked = 1;
+
+	/* Lock in i_ino order */
+	if (ip->i_ino < tip->i_ino) {
+		ips[0] = ip;
+		ips[1] = tip;
+	} else {
+		ips[0] = tip;
+		ips[1] = ip;
+	}
+	lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
+	xfs_lock_inodes(ips, 2, 0, lock_flags);
+
+	/* Check permissions */
+	if ((error = _MAC_XFS_IACCESS(ip, MACWRITE, NULL))) {
+		goto error0;
+	}
+	if ((error = _MAC_XFS_IACCESS(tip, MACWRITE, NULL))) {
+		goto error0;
+	}
+	if ((current->fsuid != ip->i_d.di_uid) &&
+	    (error = xfs_iaccess(ip, IWRITE, NULL)) &&
+	    !capable_cred(NULL, CAP_FOWNER)) {
+		goto error0;
+	}
+	if ((current->fsuid != tip->i_d.di_uid) &&
+	    (error = xfs_iaccess(tip, IWRITE, NULL)) &&
+	    !capable_cred(NULL, CAP_FOWNER)) {
+		goto error0;
+	}
+
+	/* Verify both files are either real-time or non-realtime */
+	if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
+	    (tip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	/* Should never get a local format */
+	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
+	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	if (VN_CACHED(tvp) != 0)
+		xfs_inval_cached_pages(XFS_ITOV(tip), &(tip->i_iocore),
+						(loff_t)0, 0, 0);
+
+	/* Verify O_DIRECT for ftmp */
+	if (VN_CACHED(tvp) != 0) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	/* Verify all data are being swapped */
+	if (sx.sx_offset != 0 ||
+	    sx.sx_length != ip->i_d.di_size ||
+	    sx.sx_length != tip->i_d.di_size) {
+		error = XFS_ERROR(EFAULT);
+		goto error0;
+	}
+
+	/*
+	 * If the target has extended attributes, the tmp file
+	 * must also in order to ensure the correct data fork
+	 * format.
+	 */
+	if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) {
+		error = XFS_ERROR(EINVAL);
+		goto error0;
+	}
+
+	/*
+	 * Compare the current change & modify times with that
+	 * passed in.  If they differ, we abort this swap.
+	 * This is the mechanism used to ensure the calling
+	 * process that the file was not changed out from
+	 * under it.
+	 */
+	if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
+	    (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
+	    (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
+	    (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
+		error = XFS_ERROR(EBUSY);
+		goto error0;
+	}
+
+	/* We need to fail if the file is memory mapped.  Once we have tossed
+	 * all existing pages, the page fault will have no option
+	 * but to go to the filesystem for pages. By making the page fault call
+	 * VOP_READ (or write in the case of autogrow) they block on the iolock
+	 * until we have switched the extents.
+	 */
+	if (VN_MAPPED(vp)) {
+		error = XFS_ERROR(EBUSY);
+		goto error0;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(tip, XFS_ILOCK_EXCL);
+
+	/*
+	 * There is a race condition here since we gave up the
+	 * ilock.  However, the data fork will not change since
+	 * we have the iolock (locked for truncation too) so we
+	 * are safe.  We don't really care if non-io related
+	 * fields change.
+	 */
+
+	VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
+	if ((error = xfs_trans_reserve(tp, 0,
+				     XFS_ICHANGE_LOG_RES(mp), 0,
+				     0, 0))) {
+		xfs_iunlock(ip,	 XFS_IOLOCK_EXCL);
+		xfs_iunlock(tip, XFS_IOLOCK_EXCL);
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
+
+	/*
+	 * Count the number of extended attribute blocks
+	 */
+	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
+	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+		if (error) {
+			xfs_iunlock(ip,	 lock_flags);
+			xfs_iunlock(tip, lock_flags);
+			xfs_trans_cancel(tp, 0);
+			return error;
+		}
+	}
+	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
+	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
+		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
+			&taforkblks);
+		if (error) {
+			xfs_iunlock(ip,	 lock_flags);
+			xfs_iunlock(tip, lock_flags);
+			xfs_trans_cancel(tp, 0);
+			return error;
+		}
+	}
+
+	/*
+	 * Swap the data forks of the inodes
+	 */
+	ifp = &ip->i_df;
+	tifp = &tip->i_df;
+	tempif = *ifp;	/* struct copy */
+	*ifp = *tifp;	/* struct copy */
+	*tifp = tempif; /* struct copy */
+
+	/*
+	 * Fix the on-disk inode values
+	 */
+	tmp = (__uint64_t)ip->i_d.di_nblocks;
+	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
+	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
+
+	tmp = (__uint64_t) ip->i_d.di_nextents;
+	ip->i_d.di_nextents = tip->i_d.di_nextents;
+	tip->i_d.di_nextents = tmp;
+
+	tmp = (__uint64_t) ip->i_d.di_format;
+	ip->i_d.di_format = tip->i_d.di_format;
+	tip->i_d.di_format = tmp;
+
+	ilf_fields = XFS_ILOG_CORE;
+
+	switch(ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		/* If the extents fit in the inode, fix the
+		 * pointer.  Otherwise it's already NULL or
+		 * pointing to the extent.
+		 */
+		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+			ifp->if_u1.if_extents =
+				ifp->if_u2.if_inline_ext;
+		}
+		ilf_fields |= XFS_ILOG_DEXT;
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		ilf_fields |= XFS_ILOG_DBROOT;
+		break;
+	}
+
+	tilf_fields = XFS_ILOG_CORE;
+
+	switch(tip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		/* If the extents fit in the inode, fix the
+		 * pointer.  Otherwise it's already NULL or
+		 * pointing to the extent.
+		 */
+		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
+			tifp->if_u1.if_extents =
+				tifp->if_u2.if_inline_ext;
+		}
+		tilf_fields |= XFS_ILOG_DEXT;
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		tilf_fields |= XFS_ILOG_DBROOT;
+		break;
+	}
+
+	/*
+	 * Increment vnode ref counts since xfs_trans_commit &
+	 * xfs_trans_cancel will both unlock the inodes and
+	 * decrement the associated ref counts.
+	 */
+	VN_HOLD(vp);
+	VN_HOLD(tvp);
+
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ijoin(tp, tip, lock_flags);
+
+	xfs_trans_log_inode(tp, ip,  ilf_fields);
+	xfs_trans_log_inode(tp, tip, tilf_fields);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL);
+
+	fput(fp);
+	fput(tfp);
+
+	return error;
+
+ error0:
+	if (locked) {
+		xfs_iunlock(ip,	 lock_flags);
+		xfs_iunlock(tip, lock_flags);
+	}
+
+	if (fp != NULL) fput(fp);
+	if (tfp != NULL) fput(tfp);
+
+	return error;
+}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
new file mode 100644
index 000000000000..7143248ce04c
--- /dev/null
+++ b/fs/xfs/xfs_dfrag.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DFRAG_H__
+#define __XFS_DFRAG_H__
+
+/*
+ * Structure passed to xfs_swapext
+ */
+
+typedef struct xfs_swapext
+{
+	__int64_t	sx_version;	/* version */
+	__int64_t	sx_fdtarget;	/* fd of target file */
+	__int64_t	sx_fdtmp;	/* fd of tmp file */
+	xfs_off_t	sx_offset;	/* offset into file */
+	xfs_off_t	sx_length;	/* leng from offset */
+	char		sx_pad[16];	/* pad space, unused */
+	xfs_bstat_t	sx_stat;	/* stat of target b4 copy */
+} xfs_swapext_t;
+
+/*
+ * Version flag
+ */
+#define XFS_SX_VERSION		0
+
+#ifdef __KERNEL__
+/*
+ * Prototypes for visible xfs_dfrag.c routines.
+ */
+
+/*
+ * Syscall interface for xfs_swapext
+ */
+int	xfs_swapext(struct xfs_swapext *sx);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
new file mode 100644
index 000000000000..5adc2eabb423
--- /dev/null
+++ b/fs/xfs/xfs_dinode.h
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DINODE_H__
+#define __XFS_DINODE_H__
+
+struct xfs_buf;
+struct xfs_mount;
+
+#define XFS_DINODE_VERSION_1	1
+#define XFS_DINODE_VERSION_2	2
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DINODE_GOOD_VERSION)
+int xfs_dinode_good_version(int v);
+#define XFS_DINODE_GOOD_VERSION(v)	xfs_dinode_good_version(v)
+#else
+#define XFS_DINODE_GOOD_VERSION(v)	(((v) == XFS_DINODE_VERSION_1) || \
+					 ((v) == XFS_DINODE_VERSION_2))
+#endif
+#define XFS_DINODE_MAGIC	0x494e	/* 'IN' */
+
+/*
+ * Disk inode structure.
+ * This is just the header; the inode is expanded to fill a variable size
+ * with the last field expanding.  It is split into the core and "other"
+ * because we only need the core part in the in-core inode.
+ */
+typedef struct xfs_timestamp {
+	__int32_t	t_sec;		/* timestamp seconds */
+	__int32_t	t_nsec;		/* timestamp nanoseconds */
+} xfs_timestamp_t;
+
+/*
+ * Note: Coordinate changes to this structure with the XFS_DI_* #defines
+ * below and the offsets table in xfs_ialloc_log_di().
+ */
+typedef struct xfs_dinode_core
+{
+	__uint16_t	di_magic;	/* inode magic # = XFS_DINODE_MAGIC */
+	__uint16_t	di_mode;	/* mode and type of file */
+	__int8_t	di_version;	/* inode version */
+	__int8_t	di_format;	/* format of di_c data */
+	__uint16_t	di_onlink;	/* old number of links to file */
+	__uint32_t	di_uid;		/* owner's user id */
+	__uint32_t	di_gid;		/* owner's group id */
+	__uint32_t	di_nlink;	/* number of links to file */
+	__uint16_t	di_projid;	/* owner's project id */
+	__uint8_t	di_pad[10];	/* unused, zeroed space */
+	xfs_timestamp_t di_atime;	/* time last accessed */
+	xfs_timestamp_t di_mtime;	/* time last modified */
+	xfs_timestamp_t di_ctime;	/* time created/inode modified */
+	xfs_fsize_t	di_size;	/* number of bytes in file */
+	xfs_drfsbno_t	di_nblocks;	/* # of direct & btree blocks used */
+	xfs_extlen_t	di_extsize;	/* basic/minimum extent size for file */
+	xfs_extnum_t	di_nextents;	/* number of extents in data fork */
+	xfs_aextnum_t	di_anextents;	/* number of extents in attribute fork*/
+	__uint8_t	di_forkoff;	/* attr fork offs, <<3 for 64b align */
+	__int8_t	di_aformat;	/* format of attr fork's data */
+	__uint32_t	di_dmevmask;	/* DMIG event mask */
+	__uint16_t	di_dmstate;	/* DMIG state info */
+	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... */
+	__uint32_t	di_gen;		/* generation number */
+} xfs_dinode_core_t;
+
+typedef struct xfs_dinode
+{
+	xfs_dinode_core_t	di_core;
+	/*
+	 * In adding anything between the core and the union, be
+	 * sure to update the macros like XFS_LITINO below and
+	 * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
+	 */
+	xfs_agino_t		di_next_unlinked;/* agi unlinked list ptr */
+	union {
+		xfs_bmdr_block_t di_bmbt;	/* btree root block */
+		xfs_bmbt_rec_32_t di_bmx[1];	/* extent list */
+		xfs_dir_shortform_t di_dirsf;	/* shortform directory */
+		xfs_dir2_sf_t	di_dir2sf;	/* shortform directory v2 */
+		char		di_c[1];	/* local contents */
+		xfs_dev_t	di_dev;		/* device for IFCHR/IFBLK */
+		uuid_t		di_muuid;	/* mount point value */
+		char		di_symlink[1];	/* local symbolic link */
+	}		di_u;
+	union {
+		xfs_bmdr_block_t di_abmbt;	/* btree root block */
+		xfs_bmbt_rec_32_t di_abmx[1];	/* extent list */
+		xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
+	}		di_a;
+} xfs_dinode_t;
+
+/*
+ * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
+ * Since the pathconf interface is signed, we use 2^31 - 1 instead.
+ * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
+ */
+#define XFS_MAXLINK		((1U << 31) - 1U)
+#define XFS_MAXLINK_1		65535U
+
+/*
+ * Bit names for logging disk inodes only
+ */
+#define XFS_DI_MAGIC		0x0000001
+#define XFS_DI_MODE		0x0000002
+#define XFS_DI_VERSION		0x0000004
+#define XFS_DI_FORMAT		0x0000008
+#define XFS_DI_ONLINK		0x0000010
+#define XFS_DI_UID		0x0000020
+#define XFS_DI_GID		0x0000040
+#define XFS_DI_NLINK		0x0000080
+#define XFS_DI_PROJID		0x0000100
+#define XFS_DI_PAD		0x0000200
+#define XFS_DI_ATIME		0x0000400
+#define XFS_DI_MTIME		0x0000800
+#define XFS_DI_CTIME		0x0001000
+#define XFS_DI_SIZE		0x0002000
+#define XFS_DI_NBLOCKS		0x0004000
+#define XFS_DI_EXTSIZE		0x0008000
+#define XFS_DI_NEXTENTS		0x0010000
+#define XFS_DI_NAEXTENTS	0x0020000
+#define XFS_DI_FORKOFF		0x0040000
+#define XFS_DI_AFORMAT		0x0080000
+#define XFS_DI_DMEVMASK		0x0100000
+#define XFS_DI_DMSTATE		0x0200000
+#define XFS_DI_FLAGS		0x0400000
+#define XFS_DI_GEN		0x0800000
+#define XFS_DI_NEXT_UNLINKED	0x1000000
+#define XFS_DI_U		0x2000000
+#define XFS_DI_A		0x4000000
+#define XFS_DI_NUM_BITS		27
+#define XFS_DI_ALL_BITS		((1 << XFS_DI_NUM_BITS) - 1)
+#define XFS_DI_CORE_BITS	(XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
+
+/*
+ * Values for di_format
+ */
+typedef enum xfs_dinode_fmt
+{
+	XFS_DINODE_FMT_DEV,		/* CHR, BLK: di_dev */
+	XFS_DINODE_FMT_LOCAL,		/* DIR, REG: di_c */
+					/* LNK: di_symlink */
+	XFS_DINODE_FMT_EXTENTS,		/* DIR, REG, LNK: di_bmx */
+	XFS_DINODE_FMT_BTREE,		/* DIR, REG, LNK: di_bmbt */
+	XFS_DINODE_FMT_UUID		/* MNT: di_uuid */
+} xfs_dinode_fmt_t;
+
+/*
+ * Inode minimum and maximum sizes.
+ */
+#define XFS_DINODE_MIN_LOG	8
+#define XFS_DINODE_MAX_LOG	11
+#define XFS_DINODE_MIN_SIZE	(1 << XFS_DINODE_MIN_LOG)
+#define XFS_DINODE_MAX_SIZE	(1 << XFS_DINODE_MAX_LOG)
+
+/*
+ * Inode size for given fs.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LITINO)
+int xfs_litino(struct xfs_mount *mp);
+#define XFS_LITINO(mp)		xfs_litino(mp)
+#else
+#define XFS_LITINO(mp)	((mp)->m_litino)
+#endif
+#define XFS_BROOT_SIZE_ADJ	\
+	(sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t))
+
+/*
+ * Fork identifiers.  Here so utilities can use them without including
+ * xfs_inode.h.
+ */
+#define XFS_DATA_FORK	0
+#define XFS_ATTR_FORK	1
+
+/*
+ * Inode data & attribute fork sizes, per inode.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_Q)
+int xfs_cfork_q_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch);
+int xfs_cfork_q(xfs_dinode_core_t *dcp);
+#define XFS_CFORK_Q_ARCH(dcp,arch)	    xfs_cfork_q_arch(dcp,arch)
+#define XFS_CFORK_Q(dcp)		    xfs_cfork_q(dcp)
+#else
+#define XFS_CFORK_Q_ARCH(dcp,arch)	    (!INT_ISZERO((dcp)->di_forkoff, arch))
+#define XFS_CFORK_Q(dcp)		    ((dcp)->di_forkoff != 0)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_BOFF)
+int xfs_cfork_boff_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch);
+int xfs_cfork_boff(xfs_dinode_core_t *dcp);
+#define XFS_CFORK_BOFF_ARCH(dcp,arch)	    xfs_cfork_boff_arch(dcp,arch)
+#define XFS_CFORK_BOFF(dcp)		    xfs_cfork_boff(dcp)
+#else
+#define XFS_CFORK_BOFF_ARCH(dcp,arch)	    ((int)(INT_GET((dcp)->di_forkoff, arch) << 3))
+#define XFS_CFORK_BOFF(dcp)		    ((int)((dcp)->di_forkoff << 3))
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_DSIZE)
+int xfs_cfork_dsize_arch(xfs_dinode_core_t *dcp, struct xfs_mount *mp, xfs_arch_t arch);
+int xfs_cfork_dsize(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
+#define XFS_CFORK_DSIZE_ARCH(dcp,mp,arch)   xfs_cfork_dsize_arch(dcp,mp,arch)
+#define XFS_CFORK_DSIZE(dcp,mp)		    xfs_cfork_dsize(dcp,mp)
+#else
+#define XFS_CFORK_DSIZE_ARCH(dcp,mp,arch) \
+	(XFS_CFORK_Q_ARCH(dcp, arch) ? XFS_CFORK_BOFF_ARCH(dcp, arch) : XFS_LITINO(mp))
+#define XFS_CFORK_DSIZE(dcp,mp) \
+	(XFS_CFORK_Q(dcp) ? XFS_CFORK_BOFF(dcp) : XFS_LITINO(mp))
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_ASIZE)
+int xfs_cfork_asize_arch(xfs_dinode_core_t *dcp, struct xfs_mount *mp, xfs_arch_t arch);
+int xfs_cfork_asize(xfs_dinode_core_t *dcp, struct xfs_mount *mp);
+#define XFS_CFORK_ASIZE_ARCH(dcp,mp,arch)   xfs_cfork_asize_arch(dcp,mp,arch)
+#define XFS_CFORK_ASIZE(dcp,mp)		    xfs_cfork_asize(dcp,mp)
+#else
+#define XFS_CFORK_ASIZE_ARCH(dcp,mp,arch) \
+	(XFS_CFORK_Q_ARCH(dcp, arch) ? XFS_LITINO(mp) - XFS_CFORK_BOFF_ARCH(dcp, arch) : 0)
+#define XFS_CFORK_ASIZE(dcp,mp) \
+	(XFS_CFORK_Q(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF(dcp) : 0)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_SIZE)
+int xfs_cfork_size_arch(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w, xfs_arch_t arch);
+int xfs_cfork_size(xfs_dinode_core_t *dcp, struct xfs_mount *mp, int w);
+#define XFS_CFORK_SIZE_ARCH(dcp,mp,w,arch)  xfs_cfork_size_arch(dcp,mp,w,arch)
+#define XFS_CFORK_SIZE(dcp,mp,w)	    xfs_cfork_size(dcp,mp,w)
+#else
+#define XFS_CFORK_SIZE_ARCH(dcp,mp,w,arch) \
+	((w) == XFS_DATA_FORK ? \
+		XFS_CFORK_DSIZE_ARCH(dcp, mp, arch) : XFS_CFORK_ASIZE_ARCH(dcp, mp, arch))
+#define XFS_CFORK_SIZE(dcp,mp,w) \
+	((w) == XFS_DATA_FORK ? \
+		XFS_CFORK_DSIZE(dcp, mp) : XFS_CFORK_ASIZE(dcp, mp))
+
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DSIZE)
+int xfs_dfork_dsize_arch(xfs_dinode_t *dip, struct xfs_mount *mp, xfs_arch_t arch);
+int xfs_dfork_dsize(xfs_dinode_t *dip, struct xfs_mount *mp);
+#define XFS_DFORK_DSIZE_ARCH(dip,mp,arch)   xfs_dfork_dsize_arch(dip,mp,arch)
+#define XFS_DFORK_DSIZE(dip,mp)		    xfs_dfork_dsize(dip,mp)
+#else
+#define XFS_DFORK_DSIZE_ARCH(dip,mp,arch)   XFS_CFORK_DSIZE_ARCH(&(dip)->di_core, mp, arch)
+#define XFS_DFORK_DSIZE(dip,mp)		    XFS_DFORK_DSIZE_ARCH(dip,mp,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_ASIZE)
+int xfs_dfork_asize_arch(xfs_dinode_t *dip, struct xfs_mount *mp, xfs_arch_t arch);
+int xfs_dfork_asize(xfs_dinode_t *dip, struct xfs_mount *mp);
+#define XFS_DFORK_ASIZE_ARCH(dip,mp,arch)   xfs_dfork_asize_arch(dip,mp,arch)
+#define XFS_DFORK_ASIZE(dip,mp)		    xfs_dfork_asize(dip,mp)
+#else
+#define XFS_DFORK_ASIZE_ARCH(dip,mp,arch)   XFS_CFORK_ASIZE_ARCH(&(dip)->di_core, mp, arch)
+#define XFS_DFORK_ASIZE(dip,mp)		    XFS_DFORK_ASIZE_ARCH(dip,mp,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_SIZE)
+int xfs_dfork_size_arch(xfs_dinode_t *dip, struct xfs_mount *mp, int w, xfs_arch_t arch);
+int xfs_dfork_size(xfs_dinode_t *dip, struct xfs_mount *mp, int w);
+#define XFS_DFORK_SIZE_ARCH(dip,mp,w,arch)  xfs_dfork_size_arch(dip,mp,w,arch)
+#define XFS_DFORK_SIZE(dip,mp,w)	    xfs_dfork_size(dip,mp,w)
+#else
+#define XFS_DFORK_SIZE_ARCH(dip,mp,w,arch)  XFS_CFORK_SIZE_ARCH(&(dip)->di_core, mp, w, arch)
+#define XFS_DFORK_SIZE(dip,mp,w)	    XFS_DFORK_SIZE_ARCH(dip,mp,w,ARCH_NOCONVERT)
+
+#endif
+
+/*
+ * Macros for accessing per-fork disk inode information.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_Q)
+int xfs_dfork_q_arch(xfs_dinode_t *dip, xfs_arch_t arch);
+int xfs_dfork_q(xfs_dinode_t *dip);
+#define XFS_DFORK_Q_ARCH(dip,arch)	    xfs_dfork_q_arch(dip,arch)
+#define XFS_DFORK_Q(dip)		    xfs_dfork_q(dip)
+#else
+#define XFS_DFORK_Q_ARCH(dip,arch)	    XFS_CFORK_Q_ARCH(&(dip)->di_core, arch)
+#define XFS_DFORK_Q(dip)		    XFS_DFORK_Q_ARCH(dip,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_BOFF)
+int xfs_dfork_boff_arch(xfs_dinode_t *dip, xfs_arch_t arch);
+int xfs_dfork_boff(xfs_dinode_t *dip);
+#define XFS_DFORK_BOFF_ARCH(dip,arch)	    xfs_dfork_boff_arch(dip,arch)
+#define XFS_DFORK_BOFF(dip)		    xfs_dfork_boff(dip)
+#else
+#define XFS_DFORK_BOFF_ARCH(dip,arch)	    XFS_CFORK_BOFF_ARCH(&(dip)->di_core, arch)
+#define XFS_DFORK_BOFF(dip)		    XFS_DFORK_BOFF_ARCH(dip,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_DPTR)
+char *xfs_dfork_dptr_arch(xfs_dinode_t *dip, xfs_arch_t arch);
+char *xfs_dfork_dptr(xfs_dinode_t *dip);
+#define XFS_DFORK_DPTR_ARCH(dip,arch)	    xfs_dfork_dptr_arch(dip,arch)
+#define XFS_DFORK_DPTR(dip)		    xfs_dfork_dptr(dip)
+#else
+#define XFS_DFORK_DPTR_ARCH(dip,arch)	    ((dip)->di_u.di_c)
+#define XFS_DFORK_DPTR(dip)		    XFS_DFORK_DPTR_ARCH(dip,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_APTR)
+char *xfs_dfork_aptr_arch(xfs_dinode_t *dip, xfs_arch_t arch);
+char *xfs_dfork_aptr(xfs_dinode_t *dip);
+#define XFS_DFORK_APTR_ARCH(dip,arch)	    xfs_dfork_aptr_arch(dip,arch)
+#define XFS_DFORK_APTR(dip)		    xfs_dfork_aptr(dip)
+#else
+#define XFS_DFORK_APTR_ARCH(dip,arch)	    ((dip)->di_u.di_c + XFS_DFORK_BOFF_ARCH(dip, arch))
+#define XFS_DFORK_APTR(dip)		    XFS_DFORK_APTR_ARCH(dip,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_PTR)
+char *xfs_dfork_ptr_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch);
+char *xfs_dfork_ptr(xfs_dinode_t *dip, int w);
+#define XFS_DFORK_PTR_ARCH(dip,w,arch)	    xfs_dfork_ptr_arch(dip,w,arch)
+#define XFS_DFORK_PTR(dip,w)		    xfs_dfork_ptr(dip,w)
+#else
+#define XFS_DFORK_PTR_ARCH(dip,w,arch)	\
+	((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR_ARCH(dip, arch) : XFS_DFORK_APTR_ARCH(dip, arch))
+#define XFS_DFORK_PTR(dip,w)		    XFS_DFORK_PTR_ARCH(dip,w,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FORMAT)
+int xfs_cfork_format_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch);
+int xfs_cfork_format(xfs_dinode_core_t *dcp, int w);
+#define XFS_CFORK_FORMAT_ARCH(dcp,w,arch)   xfs_cfork_format_arch(dcp,w,arch)
+#define XFS_CFORK_FORMAT(dcp,w)		    xfs_cfork_format(dcp,w)
+#else
+#define XFS_CFORK_FORMAT_ARCH(dcp,w,arch) \
+	((w) == XFS_DATA_FORK ? INT_GET((dcp)->di_format, arch) : INT_GET((dcp)->di_aformat, arch))
+#define XFS_CFORK_FORMAT(dcp,w)		    XFS_CFORK_FORMAT_ARCH(dcp,w,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_FMT_SET)
+void xfs_cfork_fmt_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch);
+void xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n);
+#define XFS_CFORK_FMT_SET_ARCH(dcp,w,n,arch) xfs_cfork_fmt_set_arch(dcp,w,n,arch)
+#define XFS_CFORK_FMT_SET(dcp,w,n)	     xfs_cfork_fmt_set(dcp,w,n)
+#else
+#define XFS_CFORK_FMT_SET_ARCH(dcp,w,n,arch) \
+	((w) == XFS_DATA_FORK ? \
+		(INT_SET((dcp)->di_format, arch, (n))) : \
+		(INT_SET((dcp)->di_aformat, arch, (n))))
+#define XFS_CFORK_FMT_SET(dcp,w,n)	     XFS_CFORK_FMT_SET_ARCH(dcp,w,n,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXTENTS)
+int xfs_cfork_nextents_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch);
+int xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w);
+#define XFS_CFORK_NEXTENTS_ARCH(dcp,w,arch)  xfs_cfork_nextents_arch(dcp,w,arch)
+#define XFS_CFORK_NEXTENTS(dcp,w)	     xfs_cfork_nextents(dcp,w)
+#else
+#define XFS_CFORK_NEXTENTS_ARCH(dcp,w,arch) \
+	((w) == XFS_DATA_FORK ? INT_GET((dcp)->di_nextents, arch) : INT_GET((dcp)->di_anextents, arch))
+#define XFS_CFORK_NEXTENTS(dcp,w)	     XFS_CFORK_NEXTENTS_ARCH(dcp,w,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_CFORK_NEXT_SET)
+void xfs_cfork_next_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch);
+void xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n);
+#define XFS_CFORK_NEXT_SET_ARCH(dcp,w,n,arch)	xfs_cfork_next_set_arch(dcp,w,n,arch)
+#define XFS_CFORK_NEXT_SET(dcp,w,n)		xfs_cfork_next_set(dcp,w,n)
+#else
+#define XFS_CFORK_NEXT_SET_ARCH(dcp,w,n,arch) \
+	((w) == XFS_DATA_FORK ? \
+		(INT_SET((dcp)->di_nextents, arch, (n))) : \
+		(INT_SET((dcp)->di_anextents, arch, (n))))
+#define XFS_CFORK_NEXT_SET(dcp,w,n)		XFS_CFORK_NEXT_SET_ARCH(dcp,w,n,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_FORMAT)
+int xfs_dfork_format_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch);
+int xfs_dfork_format(xfs_dinode_t *dip, int w);
+#define XFS_DFORK_FORMAT_ARCH(dip,w,arch)   xfs_dfork_format_arch(dip,w,arch)
+#define XFS_DFORK_FORMAT(dip,w)		    xfs_dfork_format(dip,w)
+#else
+#define XFS_DFORK_FORMAT_ARCH(dip,w,arch)   XFS_CFORK_FORMAT_ARCH(&(dip)->di_core, w, arch)
+#define XFS_DFORK_FORMAT(dip,w)		    XFS_DFORK_FORMAT_ARCH(dip,w,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_FMT_SET)
+void xfs_dfork_fmt_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch);
+void xfs_dfork_fmt_set(xfs_dinode_t *dip, int w, int n);
+#define XFS_DFORK_FMT_SET_ARCH(dip,w,n,arch)	xfs_dfork_fmt_set_arch(dip,w,n,arch)
+#define XFS_DFORK_FMT_SET(dip,w,n)		xfs_dfork_fmt_set(dip,w,n)
+#else
+#define XFS_DFORK_FMT_SET_ARCH(dip,w,n,arch)	XFS_CFORK_FMT_SET_ARCH(&(dip)->di_core, w, n, arch)
+#define XFS_DFORK_FMT_SET(dip,w,n)		XFS_DFORK_FMT_SET_ARCH(dip,w,n,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_NEXTENTS)
+int xfs_dfork_nextents_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch);
+int xfs_dfork_nextents(xfs_dinode_t *dip, int w);
+#define XFS_DFORK_NEXTENTS_ARCH(dip,w,arch) xfs_dfork_nextents_arch(dip,w,arch)
+#define XFS_DFORK_NEXTENTS(dip,w)	    xfs_dfork_nextents(dip,w)
+#else
+#define XFS_DFORK_NEXTENTS_ARCH(dip,w,arch) XFS_CFORK_NEXTENTS_ARCH(&(dip)->di_core, w, arch)
+#define XFS_DFORK_NEXTENTS(dip,w)	    XFS_DFORK_NEXTENTS_ARCH(dip,w,ARCH_NOCONVERT)
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DFORK_NEXT_SET)
+void xfs_dfork_next_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch);
+void xfs_dfork_next_set(xfs_dinode_t *dip, int w, int n);
+#define XFS_DFORK_NEXT_SET_ARCH(dip,w,n,arch)	xfs_dfork_next_set_arch(dip,w,n,arch)
+#define XFS_DFORK_NEXT_SET(dip,w,n)		xfs_dfork_next_set(dip,w,n)
+#else
+#define XFS_DFORK_NEXT_SET_ARCH(dip,w,n,arch)	XFS_CFORK_NEXT_SET_ARCH(&(dip)->di_core, w, n, arch)
+#define XFS_DFORK_NEXT_SET(dip,w,n)		XFS_DFORK_NEXT_SET_ARCH(dip,w,n,ARCH_NOCONVERT)
+
+#endif
+
+/*
+ * File types (mode field)
+ */
+#define IFMT		0170000		/* type of file */
+#define IFIFO		0010000		/* named pipe (fifo) */
+#define IFCHR		0020000		/* character special */
+#define IFDIR		0040000		/* directory */
+#define IFBLK		0060000		/* block special */
+#define IFREG		0100000		/* regular */
+#define IFLNK		0120000		/* symbolic link */
+#define IFSOCK		0140000		/* socket */
+#define IFMNT		0160000		/* mount point */
+
+/*
+ * File execution and access modes.
+ */
+#define ISUID		04000		/* set user id on execution */
+#define ISGID		02000		/* set group id on execution */
+#define ISVTX		01000		/* sticky directory */
+#define IREAD		0400		/* read, write, execute permissions */
+#define IWRITE		0200
+#define IEXEC		0100
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_DINODE)
+xfs_dinode_t *xfs_buf_to_dinode(struct xfs_buf *bp);
+#define XFS_BUF_TO_DINODE(bp)	xfs_buf_to_dinode(bp)
+#else
+#define XFS_BUF_TO_DINODE(bp)	((xfs_dinode_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+/*
+ * Values for di_flags
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_XFLAG_s.
+ */
+#define XFS_DIFLAG_REALTIME_BIT 0	/* file's blocks come from rt area */
+#define XFS_DIFLAG_PREALLOC_BIT 1	/* file space has been preallocated */
+#define XFS_DIFLAG_NEWRTBM_BIT	2	/* for rtbitmap inode, new format */
+#define XFS_DIFLAG_REALTIME	(1 << XFS_DIFLAG_REALTIME_BIT)
+#define XFS_DIFLAG_PREALLOC	(1 << XFS_DIFLAG_PREALLOC_BIT)
+#define XFS_DIFLAG_NEWRTBM	(1 << XFS_DIFLAG_NEWRTBM_BIT)
+#define XFS_DIFLAG_ALL	\
+	(XFS_DIFLAG_REALTIME|XFS_DIFLAG_PREALLOC|XFS_DIFLAG_NEWRTBM)
+
+#endif	/* __XFS_DINODE_H__ */
diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c
new file mode 100644
index 000000000000..ea2f5798cf70
--- /dev/null
+++ b/fs/xfs/xfs_dir.c
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * xfs_dir.c
+ *
+ * Provide the external interfaces to manage directories.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Functions for the dirops interfaces.
+ */
+static void	xfs_dir_mount(struct xfs_mount *mp);
+
+static int	xfs_dir_isempty(struct xfs_inode *dp);
+
+static int	xfs_dir_init(struct xfs_trans *trans,
+			     struct xfs_inode *dir,
+			     struct xfs_inode *parent_dir);
+
+static int	xfs_dir_createname(struct xfs_trans *trans,
+				   struct xfs_inode *dp,
+				   char *name_string,
+				   int name_len,
+				   xfs_ino_t inode_number,
+				   xfs_fsblock_t *firstblock,
+				   xfs_bmap_free_t *flist,
+				   xfs_extlen_t total);
+
+static int	xfs_dir_lookup(struct xfs_trans *tp,
+			       struct xfs_inode *dp,
+			       char *name_string,
+			       int name_length,
+			       xfs_ino_t *inode_number);
+
+static int	xfs_dir_removename(struct xfs_trans *trans,
+				   struct xfs_inode *dp,
+				   char *name_string,
+				   int name_length,
+				   xfs_ino_t ino,
+				   xfs_fsblock_t *firstblock,
+				   xfs_bmap_free_t *flist,
+				   xfs_extlen_t total);
+
+static int	xfs_dir_getdents(struct xfs_trans *tp,
+				 struct xfs_inode *dp,
+				 struct uio *uiop,
+				 int *eofp);
+
+static int	xfs_dir_replace(struct xfs_trans *tp,
+				struct xfs_inode *dp,
+				char *name_string,
+				int name_length,
+				xfs_ino_t inode_number,
+				xfs_fsblock_t *firstblock,
+				xfs_bmap_free_t *flist,
+				xfs_extlen_t total);
+
+static int	xfs_dir_canenter(struct xfs_trans *tp,
+				 struct xfs_inode *dp,
+				 char *name_string,
+				 int name_length);
+
+static int	xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp,
+						  xfs_dinode_t *dip);
+
+xfs_dirops_t xfsv1_dirops = {
+	.xd_mount			= xfs_dir_mount,
+	.xd_isempty			= xfs_dir_isempty,
+	.xd_init			= xfs_dir_init,
+	.xd_createname			= xfs_dir_createname,
+	.xd_lookup			= xfs_dir_lookup,
+	.xd_removename			= xfs_dir_removename,
+	.xd_getdents			= xfs_dir_getdents,
+	.xd_replace			= xfs_dir_replace,
+	.xd_canenter			= xfs_dir_canenter,
+	.xd_shortform_validate_ondisk	= xfs_dir_shortform_validate_ondisk,
+	.xd_shortform_to_single		= xfs_dir_shortform_to_leaf,
+};
+
+/*
+ * Internal routines when dirsize == XFS_LBSIZE(mp).
+ */
+STATIC int xfs_dir_leaf_lookup(xfs_da_args_t *args);
+STATIC int xfs_dir_leaf_removename(xfs_da_args_t *args, int *number_entries,
+						 int *total_namebytes);
+STATIC int xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
+					     uio_t *uio, int *eofp,
+					     xfs_dirent_t *dbp,
+					     xfs_dir_put_t put);
+STATIC int xfs_dir_leaf_replace(xfs_da_args_t *args);
+
+/*
+ * Internal routines when dirsize > XFS_LBSIZE(mp).
+ */
+STATIC int xfs_dir_node_addname(xfs_da_args_t *args);
+STATIC int xfs_dir_node_lookup(xfs_da_args_t *args);
+STATIC int xfs_dir_node_removename(xfs_da_args_t *args);
+STATIC int xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp,
+					     uio_t *uio, int *eofp,
+					     xfs_dirent_t *dbp,
+					     xfs_dir_put_t put);
+STATIC int xfs_dir_node_replace(xfs_da_args_t *args);
+
+#if defined(DEBUG)
+ktrace_t *xfs_dir_trace_buf;
+#endif
+
+
+/*========================================================================
+ * Overall external interface routines.
+ *========================================================================*/
+
+xfs_dahash_t	xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+/*
+ * One-time startup routine called from xfs_init().
+ */
+void
+xfs_dir_startup(void)
+{
+	xfs_dir_hash_dot = xfs_da_hashname(".", 1);
+	xfs_dir_hash_dotdot = xfs_da_hashname("..", 2);
+}
+
+/*
+ * Initialize directory-related fields in the mount structure.
+ */
+static void
+xfs_dir_mount(xfs_mount_t *mp)
+{
+	uint shortcount, leafcount, count;
+
+	mp->m_dirversion = 1;
+	shortcount = (mp->m_attroffset - (uint)sizeof(xfs_dir_sf_hdr_t)) /
+		     (uint)sizeof(xfs_dir_sf_entry_t);
+	leafcount = (XFS_LBSIZE(mp) - (uint)sizeof(xfs_dir_leaf_hdr_t)) /
+		    ((uint)sizeof(xfs_dir_leaf_entry_t) +
+		     (uint)sizeof(xfs_dir_leaf_name_t));
+	count = shortcount > leafcount ? shortcount : leafcount;
+	mp->m_dircook_elog = xfs_da_log2_roundup(count + 1);
+	ASSERT(mp->m_dircook_elog <= mp->m_sb.sb_blocklog);
+	mp->m_da_node_ents =
+		(XFS_LBSIZE(mp) - (uint)sizeof(xfs_da_node_hdr_t)) /
+		(uint)sizeof(xfs_da_node_entry_t);
+	mp->m_dir_magicpct = (XFS_LBSIZE(mp) * 37) / 100;
+	mp->m_dirblksize = mp->m_sb.sb_blocksize;
+	mp->m_dirblkfsbs = 1;
+}
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+static int
+xfs_dir_isempty(xfs_inode_t *dp)
+{
+	xfs_dir_sf_hdr_t *hdr;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (dp->i_d.di_size == 0)
+		return(1);
+	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+		return(0);
+	hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	return(hdr->count == 0);
+}
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+static int
+xfs_dir_init(xfs_trans_t *trans, xfs_inode_t *dir, xfs_inode_t *parent_dir)
+{
+	xfs_da_args_t args;
+	int error;
+
+	bzero((char *)&args, sizeof(args));
+	args.dp = dir;
+	args.trans = trans;
+
+	ASSERT((dir->i_d.di_mode & IFMT) == IFDIR);
+	if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino)))
+		return error;
+
+	return(xfs_dir_shortform_create(&args, parent_dir->i_ino));
+}
+
+/*
+ * Generic handler routine to add a name to a directory.
+ * Transitions directory from shortform to Btree as necessary.
+ */
+static int							/* error */
+xfs_dir_createname(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
+		   int namelen, xfs_ino_t inum, xfs_fsblock_t *firstblock,
+		   xfs_bmap_free_t *flist, xfs_extlen_t total)
+{
+	xfs_da_args_t args;
+	int retval, newsize, done;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+
+	if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
+		return (retval);
+
+	XFS_STATS_INC(xfsstats.xs_dir_create);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	args.dp = dp;
+	args.firstblock = firstblock;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = 0;
+	args.addname = args.oknoent = 1;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	done = 0;
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
+		if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp)) {
+			retval = xfs_dir_shortform_addname(&args);
+			done = 1;
+		} else {
+			if (total == 0)
+				return XFS_ERROR(ENOSPC);
+			retval = xfs_dir_shortform_to_leaf(&args);
+			done = retval != 0;
+		}
+	}
+	if (!done && xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_addname(&args);
+		done = retval != ENOSPC;
+		if (!done) {
+			if (total == 0)
+				return XFS_ERROR(ENOSPC);
+			retval = xfs_dir_leaf_to_node(&args);
+			done = retval != 0;
+		}
+	}
+	if (!done) {
+		retval = xfs_dir_node_addname(&args);
+	}
+	return(retval);
+}
+
+/*
+ * Generic handler routine to check if a name can be added to a directory,
+ * without adding any blocks to the directory.
+ */
+static int							/* error */
+xfs_dir_canenter(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen)
+{
+	xfs_da_args_t args;
+	int retval, newsize;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = args.addname = args.oknoent = 1;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		newsize = XFS_DIR_SF_ENTSIZE_BYNAME(args.namelen);
+		if ((dp->i_d.di_size + newsize) <= XFS_IFORK_DSIZE(dp))
+			retval = 0;
+		else
+			retval = XFS_ERROR(ENOSPC);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_addname(&args);
+	} else {
+		retval = xfs_dir_node_addname(&args);
+	}
+	return(retval);
+}
+
+/*
+ * Generic handler routine to remove a name from a directory.
+ * Transitions directory from Btree to shortform as necessary.
+ */
+static int							/* error */
+xfs_dir_removename(xfs_trans_t *trans, xfs_inode_t *dp, char *name,
+		   int namelen, xfs_ino_t ino, xfs_fsblock_t *firstblock,
+		   xfs_bmap_free_t *flist, xfs_extlen_t total)
+{
+	xfs_da_args_t args;
+	int count, totallen, newsize, retval;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	XFS_STATS_INC(xfsstats.xs_dir_remove);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = ino;
+	args.dp = dp;
+	args.firstblock = firstblock;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = args.addname = args.oknoent = 0;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		retval = xfs_dir_shortform_removename(&args);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_removename(&args, &count, &totallen);
+		if (retval == 0) {
+			newsize = XFS_DIR_SF_ALLFIT(count, totallen);
+			if (newsize <= XFS_IFORK_DSIZE(dp)) {
+				retval = xfs_dir_leaf_to_shortform(&args);
+			}
+		}
+	} else {
+		retval = xfs_dir_node_removename(&args);
+	}
+	return(retval);
+}
+
+static int							/* error */
+xfs_dir_lookup(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
+				   xfs_ino_t *inum)
+{
+	xfs_da_args_t args;
+	int retval;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN) {
+		return(XFS_ERROR(EINVAL));
+	}
+
+	XFS_STATS_INC(xfsstats.xs_dir_lookup);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = args.addname = 0;
+	args.oknoent = 1;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		retval = xfs_dir_shortform_lookup(&args);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_lookup(&args);
+	} else {
+		retval = xfs_dir_node_lookup(&args);
+	}
+	if (retval == EEXIST)
+		retval = 0;
+	*inum = args.inumber;
+	return(retval);
+}
+
+/*
+ * Implement readdir.
+ */
+static int							/* error */
+xfs_dir_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio, int *eofp)
+{
+	xfs_dirent_t *dbp;
+	int  alignment, retval;
+	xfs_dir_put_t put;
+
+	XFS_STATS_INC(xfsstats.xs_dir_getdents);
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+
+	/*
+	 * If our caller has given us a single contiguous memory buffer,
+	 * just work directly within that buffer.  If it's in user memory,
+	 * lock it down first.
+	 */
+	alignment = sizeof(xfs_off_t) - 1;
+	if ((uio->uio_iovcnt == 1) &&
+	    (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
+	    ((uio->uio_iov[0].iov_len & alignment) == 0)) {
+		dbp = NULL;
+		put = xfs_dir_put_dirent64_direct;
+	} else {
+		dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
+		put = xfs_dir_put_dirent64_uio;
+	}
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	*eofp = 0;
+
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		retval = xfs_dir_shortform_getdents(dp, uio, eofp, dbp, put);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_getdents(trans, dp, uio, eofp, dbp, put);
+	} else {
+		retval = xfs_dir_node_getdents(trans, dp, uio, eofp, dbp, put);
+	}
+	if (dbp != NULL)
+		kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
+
+	return(retval);
+}
+
+static int							/* error */
+xfs_dir_replace(xfs_trans_t *trans, xfs_inode_t *dp, char *name, int namelen,
+				    xfs_ino_t inum, xfs_fsblock_t *firstblock,
+				    xfs_bmap_free_t *flist, xfs_extlen_t total)
+{
+	xfs_da_args_t args;
+	int retval;
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN) {
+		return(XFS_ERROR(EINVAL));
+	}
+
+	if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum)))
+		return retval;
+
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	args.dp = dp;
+	args.firstblock = firstblock;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = trans;
+	args.justcheck = args.addname = args.oknoent = 0;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+		retval = xfs_dir_shortform_replace(&args);
+	} else if (xfs_bmap_one_block(dp, XFS_DATA_FORK)) {
+		retval = xfs_dir_leaf_replace(&args);
+	} else {
+		retval = xfs_dir_node_replace(&args);
+	}
+
+	return(retval);
+}
+
+static int
+xfs_dir_shortform_validate_ondisk(xfs_mount_t *mp, xfs_dinode_t *dp)
+{
+	xfs_ino_t		ino;
+	int			namelen_sum;
+	int			count;
+	xfs_dir_shortform_t	*sf;
+	xfs_dir_sf_entry_t	*sfe;
+	int			i;
+
+
+
+	if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & IFMT) != IFDIR) {
+		return 0;
+	}
+	if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) {
+		return 0;
+	}
+	if (INT_GET(dp->di_core.di_size, ARCH_CONVERT) < sizeof(sf->hdr)) {
+		xfs_fs_cmn_err(CE_WARN, mp, "Invalid shortform size: dp 0x%p\n",
+			dp);
+		return 1;
+	}
+	sf = (xfs_dir_shortform_t *)(&dp->di_u.di_dirsf);
+	ino = XFS_GET_DIR_INO_ARCH(mp, sf->hdr.parent, ARCH_CONVERT);
+	if (xfs_dir_ino_validate(mp, ino))
+		return 1;
+
+	count = sf->hdr.count;
+	if ((count < 0) || ((count * 10) > XFS_LITINO(mp))) {
+		xfs_fs_cmn_err(CE_WARN, mp,
+			"Invalid shortform count: dp 0x%p\n", dp);
+		return(1);
+	}
+
+	if (count == 0) {
+		return 0;
+	}
+
+	namelen_sum = 0;
+	sfe = &sf->list[0];
+	for (i = sf->hdr.count - 1; i >= 0; i--) {
+		ino = XFS_GET_DIR_INO_ARCH(mp, sfe->inumber, ARCH_CONVERT);
+		xfs_dir_ino_validate(mp, ino);
+		if (sfe->namelen >= XFS_LITINO(mp)) {
+			xfs_fs_cmn_err(CE_WARN, mp,
+				"Invalid shortform namelen: dp 0x%p\n", dp);
+			return 1;
+		}
+		namelen_sum += sfe->namelen;
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	if (namelen_sum >= XFS_LITINO(mp)) {
+		xfs_fs_cmn_err(CE_WARN, mp,
+			"Invalid shortform namelen: dp 0x%p\n", dp);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*========================================================================
+ * External routines when dirsize == XFS_LBSIZE(dp->i_mount).
+ *========================================================================*/
+
+/*
+ * Add a name to the leaf directory structure
+ * This is the external routine.
+ */
+int
+xfs_dir_leaf_addname(xfs_da_args_t *args)
+{
+	int index, retval;
+	xfs_dabuf_t *bp;
+
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+
+	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
+	if (retval == ENOENT)
+		retval = xfs_dir_leaf_add(bp, args, index);
+	xfs_da_buf_done(bp);
+	return(retval);
+}
+
+/*
+ * Remove a name from the leaf directory structure
+ * This is the external routine.
+ */
+STATIC int
+xfs_dir_leaf_removename(xfs_da_args_t *args, int *count, int *totallen)
+{
+	xfs_dir_leafblock_t *leaf;
+	int index, retval;
+	xfs_dabuf_t *bp;
+
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
+	if (retval == EEXIST) {
+		(void)xfs_dir_leaf_remove(args->trans, bp, index);
+		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		*totallen = INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		retval = 0;
+	}
+	xfs_da_buf_done(bp);
+	return(retval);
+}
+
+/*
+ * Look up a name in a leaf directory structure.
+ * This is the external routine.
+ */
+STATIC int
+xfs_dir_leaf_lookup(xfs_da_args_t *args)
+{
+	int index, retval;
+	xfs_dabuf_t *bp;
+
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
+	xfs_da_brelse(args->trans, bp);
+	return(retval);
+}
+
+/*
+ * Copy out directory entries for getdents(), for leaf directories.
+ */
+STATIC int
+xfs_dir_leaf_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
+				  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
+{
+	xfs_dabuf_t *bp;
+	int retval, eob;
+
+	retval = xfs_da_read_buf(dp->i_transp, dp, 0, -1, &bp, XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	retval = xfs_dir_leaf_getdents_int(bp, dp, 0, uio, &eob, dbp, put, -1);
+	xfs_da_brelse(trans, bp);
+	*eofp = (eob == 0);
+	return(retval);
+}
+
+/*
+ * Look up a name in a leaf directory structure, replace the inode number.
+ * This is the external routine.
+ */
+STATIC int
+xfs_dir_leaf_replace(xfs_da_args_t *args)
+{
+	int index, retval;
+	xfs_dabuf_t *bp;
+	xfs_ino_t inum;
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+
+	inum = args->inumber;
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	retval = xfs_dir_leaf_lookup_int(bp, args, &index);
+	if (retval == EEXIST) {
+		leaf = bp->data;
+		entry = &leaf->entries[index];
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		/* XXX - replace assert? */
+		XFS_DIR_SF_PUT_DIRINO_ARCH(&inum, &namest->inumber, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
+		xfs_da_buf_done(bp);
+		retval = 0;
+	} else
+		xfs_da_brelse(args->trans, bp);
+	return(retval);
+}
+
+
+/*========================================================================
+ * External routines when dirsize > XFS_LBSIZE(mp).
+ *========================================================================*/
+
+/*
+ * Add a name to a Btree-format directory.
+ *
+ * This will involve walking down the Btree, and may involve splitting
+ * leaf nodes and even splitting intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_dir_node_addname(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int retval, error;
+
+	/*
+	 * Fill in bucket of arguments/results/context to carry around.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name already exists, and get back a pointer
+	 * to where it should go.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error)
+		retval = error;
+	if (retval != ENOENT)
+		goto error;
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
+	retval = xfs_dir_leaf_add(blk->bp, args, blk->index);
+	if (retval == 0) {
+		/*
+		 * Addition succeeded, update Btree hashvals.
+		 */
+		if (!args->justcheck)
+			xfs_da_fixhashpath(state, &state->path);
+	} else {
+		/*
+		 * Addition failed, split as many Btree elements as required.
+		 */
+		if (args->total == 0) {
+			ASSERT(retval == ENOSPC);
+			goto error;
+		}
+		retval = xfs_da_split(state);
+	}
+error:
+	xfs_da_state_free(state);
+
+	return(retval);
+}
+
+/*
+ * Remove a name from a B-tree directory.
+ *
+ * This will involve walking down the Btree, and may involve joining
+ * leaf nodes and even joining intermediate nodes up to and including
+ * the root node (a special case of an intermediate node).
+ */
+STATIC int
+xfs_dir_node_removename(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	int retval, error;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists, and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error)
+		retval = error;
+	if (retval != EEXIST) {
+		xfs_da_state_free(state);
+		return(retval);
+	}
+
+	/*
+	 * Remove the name and update the hashvals in the tree.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
+	retval = xfs_dir_leaf_remove(args->trans, blk->bp, blk->index);
+	xfs_da_fixhashpath(state, &state->path);
+
+	/*
+	 * Check to see if the tree needs to be collapsed.
+	 */
+	error = 0;
+	if (retval) {
+		error = xfs_da_join(state);
+	}
+
+	xfs_da_state_free(state);
+	if (error)
+		return(error);
+	return(0);
+}
+
+/*
+ * Look up a filename in a int directory.
+ * Use an internal routine to actually do all the work.
+ */
+STATIC int
+xfs_dir_node_lookup(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	int retval, error, i;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+
+	/*
+	 * Search to see if name exists,
+	 * and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	}
+
+	/*
+	 * If not in a transaction, we have to release all the buffers.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return(retval);
+}
+
+STATIC int
+xfs_dir_node_getdents(xfs_trans_t *trans, xfs_inode_t *dp, uio_t *uio,
+				  int *eofp, xfs_dirent_t *dbp, xfs_dir_put_t put)
+{
+	xfs_da_intnode_t *node;
+	xfs_da_node_entry_t *btree;
+	xfs_dir_leafblock_t *leaf = NULL;
+	xfs_dablk_t bno, nextbno;
+	xfs_dahash_t cookhash;
+	xfs_mount_t *mp;
+	int error, eob, i;
+	xfs_dabuf_t *bp;
+	xfs_daddr_t nextda;
+
+	/*
+	 * Pick up our context.
+	 */
+	mp = dp->i_mount;
+	bp = NULL;
+	bno = XFS_DA_COOKIE_BNO(mp, uio->uio_offset);
+	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
+
+	xfs_dir_trace_g_du("node: start", dp, uio);
+
+	/*
+	 * Re-find our place, even if we're confused about what our place is.
+	 *
+	 * First we check the block number from the magic cookie, it is a
+	 * cache of where we ended last time.  If we find a leaf block, and
+	 * the starting hashval in that block is less than our desired
+	 * hashval, then we run with it.
+	 */
+	if (bno > 0) {
+		error = xfs_da_read_buf(trans, dp, bno, -1, &bp, XFS_DATA_FORK);
+		if ((error != 0) && (error != EFSCORRUPTED))
+			return(error);
+		if (bp)
+			leaf = bp->data;
+		if (bp && INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
+			xfs_dir_trace_g_dub("node: block not a leaf",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+			bp = NULL;
+		}
+		if (bp && INT_GET(leaf->entries[0].hashval, ARCH_CONVERT) > cookhash) {
+			xfs_dir_trace_g_dub("node: leaf hash too large",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+			bp = NULL;
+		}
+		if (bp &&
+		    cookhash > INT_GET(leaf->entries[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)) {
+			xfs_dir_trace_g_dub("node: leaf hash too small",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+			bp = NULL;
+		}
+	}
+
+	/*
+	 * If we did not find a leaf block from the blockno in the cookie,
+	 * or we there was no blockno in the cookie (eg: first time thru),
+	 * the we start at the top of the Btree and re-find our hashval.
+	 */
+	if (bp == NULL) {
+		xfs_dir_trace_g_du("node: start at root" , dp, uio);
+		bno = 0;
+		for (;;) {
+			error = xfs_da_read_buf(trans, dp, bno, -1, &bp,
+						       XFS_DATA_FORK);
+			if (error)
+				return(error);
+			if (bp == NULL)
+				return(XFS_ERROR(EFSCORRUPTED));
+			node = bp->data;
+			if (INT_GET(node->hdr.info.magic, ARCH_CONVERT) != XFS_DA_NODE_MAGIC)
+				break;
+			btree = &node->btree[0];
+			xfs_dir_trace_g_dun("node: node detail", dp, uio, node);
+			for (i = 0; i < INT_GET(node->hdr.count, ARCH_CONVERT); btree++, i++) {
+				if (INT_GET(btree->hashval, ARCH_CONVERT) >= cookhash) {
+					bno = INT_GET(btree->before, ARCH_CONVERT);
+					break;
+				}
+			}
+			if (i == INT_GET(node->hdr.count, ARCH_CONVERT)) {
+				xfs_da_brelse(trans, bp);
+				xfs_dir_trace_g_du("node: hash beyond EOF",
+							  dp, uio);
+				uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0,
+							     XFS_DA_MAXHASH);
+				*eofp = 1;
+				return(0);
+			}
+			xfs_dir_trace_g_dub("node: going to block",
+						   dp, uio, bno);
+			xfs_da_brelse(trans, bp);
+		}
+	}
+	ASSERT(cookhash != XFS_DA_MAXHASH);
+
+	/*
+	 * We've dropped down to the (first) leaf block that contains the
+	 * hashval we are interested in.  Continue rolling upward thru the
+	 * leaf blocks until we fill up our buffer.
+	 */
+	for (;;) {
+		leaf = bp->data;
+		if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
+			xfs_dir_trace_g_dul("node: not a leaf", dp, uio, leaf);
+			xfs_da_brelse(trans, bp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		xfs_dir_trace_g_dul("node: leaf detail", dp, uio, leaf);
+		if ((nextbno = INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))) {
+			nextda = xfs_da_reada_buf(trans, dp, nextbno,
+						  XFS_DATA_FORK);
+		} else
+			nextda = -1;
+		error = xfs_dir_leaf_getdents_int(bp, dp, bno, uio, &eob, dbp,
+						  put, nextda);
+		xfs_da_brelse(trans, bp);
+		bno = nextbno;
+		if (eob) {
+			xfs_dir_trace_g_dub("node: E-O-B", dp, uio, bno);
+			*eofp = 0;
+			return(error);
+		}
+		if (bno == 0)
+			break;
+		error = xfs_da_read_buf(trans, dp, bno, nextda, &bp,
+					XFS_DATA_FORK);
+		if (error)
+			return(error);
+		if (bp == NULL)
+			return(XFS_ERROR(EFSCORRUPTED));
+	}
+	*eofp = 1;
+	xfs_dir_trace_g_du("node: E-O-F", dp, uio);
+	return(0);
+}
+
+/*
+ * Look up a filename in an int directory, replace the inode number.
+ * Use an internal routine to actually do the lookup.
+ */
+STATIC int
+xfs_dir_node_replace(xfs_da_args_t *args)
+{
+	xfs_da_state_t *state;
+	xfs_da_state_blk_t *blk;
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	xfs_ino_t inum;
+	int retval, error, i;
+	xfs_dabuf_t *bp;
+
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_sb.sb_blocksize;
+	inum = args->inumber;
+
+	/*
+	 * Search to see if name exists,
+	 * and get back a pointer to it.
+	 */
+	error = xfs_da_node_lookup_int(state, &retval);
+	if (error) {
+		retval = error;
+	}
+
+	if (retval == EEXIST) {
+		blk = &state->path.blk[state->path.active - 1];
+		ASSERT(blk->magic == XFS_DIR_LEAF_MAGIC);
+		bp = blk->bp;
+		leaf = bp->data;
+		entry = &leaf->entries[blk->index];
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		/* XXX - replace assert ? */
+		XFS_DIR_SF_PUT_DIRINO_ARCH(&inum, &namest->inumber, ARCH_CONVERT);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, namest, sizeof(namest->inumber)));
+		xfs_da_buf_done(bp);
+		blk->bp = NULL;
+		retval = 0;
+	} else {
+		i = state->path.active - 1;
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	for (i = 0; i < state->path.active - 1; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+
+	xfs_da_state_free(state);
+	return(retval);
+}
+
+#if defined(XFS_DIR_TRACE)
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_du(char *where, xfs_inode_t *dp, uio_t *uio)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DU, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_dub(char *where, xfs_inode_t *dp, uio_t *uio, xfs_dablk_t bno)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUB, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)bno,
+		     NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_dun(char *where, xfs_inode_t *dp, uio_t *uio,
+			xfs_da_intnode_t *node)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUN, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)INT_GET(node->hdr.info.forw, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(node->hdr.count, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(node->btree[0].hashval, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(node->btree[INT_GET(node->hdr.count, ARCH_CONVERT)-1].hashval, ARCH_CONVERT),
+		     NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_dul(char *where, xfs_inode_t *dp, uio_t *uio,
+			xfs_dir_leafblock_t *leaf)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUL, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)INT_GET(leaf->hdr.info.forw, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(leaf->hdr.count, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(leaf->entries[0].hashval, ARCH_CONVERT),
+		     (__psunsigned_t)INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT),
+		     NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_due(char *where, xfs_inode_t *dp, uio_t *uio,
+			xfs_dir_leaf_entry_t *entry)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUE, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)INT_GET(entry->hashval, ARCH_CONVERT),
+		     NULL, NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for an inode and a uio.
+ */
+void
+xfs_dir_trace_g_duc(char *where, xfs_inode_t *dp, uio_t *uio, xfs_off_t cookie)
+{
+	xfs_dir_trace_enter(XFS_DIR_KTRACE_G_DUC, where,
+		     (__psunsigned_t)dp, (__psunsigned_t)dp->i_mount,
+		     (__psunsigned_t)(uio->uio_offset >> 32),
+		     (__psunsigned_t)(uio->uio_offset & 0xFFFFFFFF),
+		     (__psunsigned_t)uio->uio_resid,
+		     (__psunsigned_t)(cookie >> 32),
+		     (__psunsigned_t)(cookie & 0xFFFFFFFF),
+		     NULL, NULL, NULL, NULL, NULL);
+}
+
+/*
+ * Add a trace buffer entry for the arguments given to the routine,
+ * generic form.
+ */
+void
+xfs_dir_trace_enter(int type, char *where,
+			__psunsigned_t a0, __psunsigned_t a1,
+			__psunsigned_t a2, __psunsigned_t a3,
+			__psunsigned_t a4, __psunsigned_t a5,
+			__psunsigned_t a6, __psunsigned_t a7,
+			__psunsigned_t a8, __psunsigned_t a9,
+			__psunsigned_t a10, __psunsigned_t a11)
+{
+	ASSERT(xfs_dir_trace_buf);
+	ktrace_enter(xfs_dir_trace_buf, (void *)((__psunsigned_t)type),
+					(void *)where,
+					(void *)a0, (void *)a1, (void *)a2,
+					(void *)a3, (void *)a4, (void *)a5,
+					(void *)a6, (void *)a7, (void *)a8,
+					(void *)a9, (void *)a10, (void *)a11,
+					NULL, NULL);
+}
+#endif	/* XFS_DIR_TRACE */
diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h
new file mode 100644
index 000000000000..73369f65633c
--- /dev/null
+++ b/fs/xfs/xfs_dir.h
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR_H__
+#define __XFS_DIR_H__
+
+/*
+ * Large directories are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Filenames are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of a filename may not be unique, we may have duplicate keys.	 The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Small directories use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
+
+#ifdef XFS_ALL_TRACE
+#define XFS_DIR_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_DIR_TRACE
+#endif
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+struct uio;
+struct xfs_bmap_free;
+struct xfs_da_args;
+struct xfs_dinode;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Directory function types.
+ * Put in structures (xfs_dirops_t) for v1 and v2 directories.
+ */
+typedef void	(*xfs_dir_mount_t)(struct xfs_mount *mp);
+typedef int	(*xfs_dir_isempty_t)(struct xfs_inode *dp);
+typedef int	(*xfs_dir_init_t)(struct xfs_trans *tp,
+				  struct xfs_inode *dp,
+				  struct xfs_inode *pdp);
+typedef int	(*xfs_dir_createname_t)(struct xfs_trans *tp,
+					struct xfs_inode *dp,
+					char *name,
+					int namelen,
+					xfs_ino_t inum,
+					xfs_fsblock_t *first,
+					struct xfs_bmap_free *flist,
+					xfs_extlen_t total);
+typedef int	(*xfs_dir_lookup_t)(struct xfs_trans *tp,
+				    struct xfs_inode *dp,
+				    char *name,
+				    int namelen,
+				    xfs_ino_t *inum);
+typedef int	(*xfs_dir_removename_t)(struct xfs_trans *tp,
+					struct xfs_inode *dp,
+					char *name,
+					int namelen,
+					xfs_ino_t ino,
+					xfs_fsblock_t *first,
+					struct xfs_bmap_free *flist,
+					xfs_extlen_t total);
+typedef int	(*xfs_dir_getdents_t)(struct xfs_trans *tp,
+				      struct xfs_inode *dp,
+				      struct uio *uio,
+				      int *eofp);
+typedef int	(*xfs_dir_replace_t)(struct xfs_trans *tp,
+				     struct xfs_inode *dp,
+				     char *name,
+				     int namelen,
+				     xfs_ino_t inum,
+				     xfs_fsblock_t *first,
+				     struct xfs_bmap_free *flist,
+				     xfs_extlen_t total);
+typedef int	(*xfs_dir_canenter_t)(struct xfs_trans *tp,
+				      struct xfs_inode *dp,
+				      char *name,
+				      int namelen);
+typedef int	(*xfs_dir_shortform_validate_ondisk_t)(struct xfs_mount *mp,
+						       struct xfs_dinode *dip);
+typedef int	(*xfs_dir_shortform_to_single_t)(struct xfs_da_args *args);
+
+typedef struct xfs_dirops {
+	xfs_dir_mount_t				xd_mount;
+	xfs_dir_isempty_t			xd_isempty;
+	xfs_dir_init_t				xd_init;
+	xfs_dir_createname_t			xd_createname;
+	xfs_dir_lookup_t			xd_lookup;
+	xfs_dir_removename_t			xd_removename;
+	xfs_dir_getdents_t			xd_getdents;
+	xfs_dir_replace_t			xd_replace;
+	xfs_dir_canenter_t			xd_canenter;
+	xfs_dir_shortform_validate_ondisk_t	xd_shortform_validate_ondisk;
+	xfs_dir_shortform_to_single_t		xd_shortform_to_single;
+} xfs_dirops_t;
+
+/*
+ * Overall external interface routines.
+ */
+void	xfs_dir_startup(void);	/* called exactly once */
+
+#define XFS_DIR_MOUNT(mp)	\
+	((mp)->m_dirops.xd_mount(mp))
+#define XFS_DIR_ISEMPTY(mp,dp)	\
+	((mp)->m_dirops.xd_isempty(dp))
+#define XFS_DIR_INIT(mp,tp,dp,pdp)	\
+	((mp)->m_dirops.xd_init(tp,dp,pdp))
+#define XFS_DIR_CREATENAME(mp,tp,dp,name,namelen,inum,first,flist,total) \
+	((mp)->m_dirops.xd_createname(tp,dp,name,namelen,inum,first,flist,\
+				      total))
+#define XFS_DIR_LOOKUP(mp,tp,dp,name,namelen,inum)	\
+	((mp)->m_dirops.xd_lookup(tp,dp,name,namelen,inum))
+#define XFS_DIR_REMOVENAME(mp,tp,dp,name,namelen,ino,first,flist,total) \
+	((mp)->m_dirops.xd_removename(tp,dp,name,namelen,ino,first,flist,total))
+#define XFS_DIR_GETDENTS(mp,tp,dp,uio,eofp)	\
+	((mp)->m_dirops.xd_getdents(tp,dp,uio,eofp))
+#define XFS_DIR_REPLACE(mp,tp,dp,name,namelen,inum,first,flist,total)	\
+	((mp)->m_dirops.xd_replace(tp,dp,name,namelen,inum,first,flist,total))
+#define XFS_DIR_CANENTER(mp,tp,dp,name,namelen) \
+	((mp)->m_dirops.xd_canenter(tp,dp,name,namelen))
+#define XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp,dip)	\
+	((mp)->m_dirops.xd_shortform_validate_ondisk(mp,dip))
+#define XFS_DIR_SHORTFORM_TO_SINGLE(mp,args)	\
+	((mp)->m_dirops.xd_shortform_to_single(args))
+
+#define XFS_DIR_IS_V1(mp)	((mp)->m_dirversion == 1)
+extern xfs_dirops_t xfsv1_dirops;
+
+#endif	/* __XFS_DIR_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
new file mode 100644
index 000000000000..1ee94f626cc1
--- /dev/null
+++ b/fs/xfs/xfs_dir2.c
@@ -0,0 +1,830 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * XFS v2 directory implmentation.
+ * Top-level and utility routines.
+ */
+
+#include <xfs.h>
+
+/*
+ * Declarations for interface routines.
+ */
+static void	xfs_dir2_mount(xfs_mount_t *mp);
+static int	xfs_dir2_isempty(xfs_inode_t *dp);
+static int	xfs_dir2_init(xfs_trans_t *tp, xfs_inode_t *dp,
+			      xfs_inode_t *pdp);
+static int	xfs_dir2_createname(xfs_trans_t *tp, xfs_inode_t *dp,
+				    char *name, int namelen, xfs_ino_t inum,
+				    xfs_fsblock_t *first,
+				    xfs_bmap_free_t *flist, xfs_extlen_t total);
+static int	xfs_dir2_lookup(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
+				int namelen, xfs_ino_t *inum);
+static int	xfs_dir2_removename(xfs_trans_t *tp, xfs_inode_t *dp,
+				    char *name, int namelen, xfs_ino_t ino,
+				    xfs_fsblock_t *first,
+				    xfs_bmap_free_t *flist, xfs_extlen_t total);
+static int	xfs_dir2_getdents(xfs_trans_t *tp, xfs_inode_t *dp, uio_t *uio,
+				  int *eofp);
+static int	xfs_dir2_replace(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
+				 int namelen, xfs_ino_t inum,
+				 xfs_fsblock_t *first, xfs_bmap_free_t *flist,
+				 xfs_extlen_t total);
+static int	xfs_dir2_canenter(xfs_trans_t *tp, xfs_inode_t *dp, char *name,
+				  int namelen);
+static int	xfs_dir2_shortform_validate_ondisk(xfs_mount_t *mp,
+						   xfs_dinode_t *dip);
+
+/*
+ * Utility routine declarations.
+ */
+static int	xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa);
+static int	xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa);
+
+/*
+ * Directory operations vector.
+ */
+xfs_dirops_t	xfsv2_dirops = {
+	.xd_mount			= xfs_dir2_mount,
+	.xd_isempty			= xfs_dir2_isempty,
+	.xd_init			= xfs_dir2_init,
+	.xd_createname			= xfs_dir2_createname,
+	.xd_lookup			= xfs_dir2_lookup,
+	.xd_removename			= xfs_dir2_removename,
+	.xd_getdents			= xfs_dir2_getdents,
+	.xd_replace			= xfs_dir2_replace,
+	.xd_canenter			= xfs_dir2_canenter,
+	.xd_shortform_validate_ondisk	= xfs_dir2_shortform_validate_ondisk,
+	.xd_shortform_to_single		= xfs_dir2_sf_to_block,
+};
+
+/*
+ * Interface routines.
+ */
+
+/*
+ * Initialize directory-related fields in the mount structure.
+ */
+static void
+xfs_dir2_mount(
+	xfs_mount_t	*mp)		/* filesystem mount point */
+{
+	mp->m_dirversion = 2;
+	ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
+	       XFS_MAX_BLOCKSIZE);
+	mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
+	mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
+	mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp));
+	mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
+	mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp));
+	mp->m_da_node_ents =
+		(mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
+		(uint)sizeof(xfs_da_node_entry_t);
+	mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
+}
+
+/*
+ * Return 1 if directory contains only "." and "..".
+ */
+static int				/* return code */
+xfs_dir2_isempty(
+	xfs_inode_t	*dp)		/* incore inode structure */
+{
+	xfs_dir2_sf_t	*sfp;		/* shortform directory structure */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	/*
+	 * Might happen during shutdown.
+	 */
+	if (dp->i_d.di_size == 0) {
+		return 1;
+	}
+	if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp))
+		return 0;
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	return INT_ISZERO(sfp->hdr.count, ARCH_CONVERT);
+}
+
+/*
+ * Initialize a directory with its "." and ".." entries.
+ */
+static int				/* error */
+xfs_dir2_init(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	xfs_inode_t	*pdp)		/* incore parent directory inode */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		error;		/* error return value */
+
+	bzero((char *)&args, sizeof(args));
+	args.dp = dp;
+	args.trans = tp;
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) {
+		return error;
+	}
+	return xfs_dir2_sf_create(&args, pdp->i_ino);
+}
+
+/*
+  Enter a name in a directory.
+ */
+static int					/* error */
+xfs_dir2_createname(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*dp,		/* incore directory inode */
+	char			*name,		/* new entry name */
+	int			namelen,	/* new entry name length */
+	xfs_ino_t		inum,		/* new entry inode number */
+	xfs_fsblock_t		*first,		/* bmap's firstblock */
+	xfs_bmap_free_t		*flist,		/* bmap's freeblock list */
+	xfs_extlen_t		total)		/* bmap's total block count */
+{
+	xfs_da_args_t		args;		/* operation arguments */
+	int			rval;		/* return value */
+	int			v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+		return rval;
+	}
+	XFS_STATS_INC(xfsstats.xs_dir_create);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	args.dp = dp;
+	args.firstblock = first;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = 0;
+	args.addname = args.oknoent = 1;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_addname(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_addname(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_addname(&args);
+	else
+		rval = xfs_dir2_node_addname(&args);
+	return rval;
+}
+
+/*
+ * Lookup a name in a directory, give back the inode number.
+ */
+static int				/* error */
+xfs_dir2_lookup(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* lookup name */
+	int		namelen,	/* lookup name length */
+	xfs_ino_t	*inum)		/* out: inode number */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN) {
+		return XFS_ERROR(EINVAL);
+	}
+	XFS_STATS_INC(xfsstats.xs_dir_lookup);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = args.addname = 0;
+	args.oknoent = 1;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_lookup(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_lookup(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_lookup(&args);
+	else
+		rval = xfs_dir2_node_lookup(&args);
+	if (rval == EEXIST)
+		rval = 0;
+	if (rval == 0)
+		*inum = args.inumber;
+	return rval;
+}
+
+/*
+ * Remove an entry from a directory.
+ */
+static int				/* error */
+xfs_dir2_removename(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* name of entry to remove */
+	int		namelen,	/* name length of entry to remove */
+	xfs_ino_t	ino,		/* inode number of entry to remove */
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t *flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	XFS_STATS_INC(xfsstats.xs_dir_remove);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = ino;
+	args.dp = dp;
+	args.firstblock = first;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = args.addname = args.oknoent = 0;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_removename(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_removename(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_removename(&args);
+	else
+		rval = xfs_dir2_node_removename(&args);
+	return rval;
+}
+
+/*
+ * Read a directory.
+ */
+static int				/* error */
+xfs_dir2_getdents(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	uio_t		*uio,		/* caller's buffer control */
+	int		*eofp)		/* out: eof reached */
+{
+	int		alignment;	/* alignment required for ABI */
+	xfs_dirent_t	*dbp;		/* malloc'ed buffer */
+	xfs_dir2_put_t	put;		/* entry formatting routine */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	XFS_STATS_INC(xfsstats.xs_dir_getdents);
+	/*
+	 * If our caller has given us a single contiguous aligned memory buffer,
+	 * just work directly within that buffer.  If it's in user memory,
+	 * lock it down first.
+	 */
+	alignment = sizeof(xfs_off_t) - 1;
+	if ((uio->uio_iovcnt == 1) &&
+	    (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) &&
+	    ((uio->uio_iov[0].iov_len & alignment) == 0)) {
+		dbp = NULL;
+		put = xfs_dir2_put_dirent64_direct;
+	} else {
+		dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP);
+		put = xfs_dir2_put_dirent64_uio;
+	}
+
+	*eofp = 0;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		;
+	} else if (v)
+		rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put);
+	else
+		rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put);
+	if (dbp != NULL)
+		kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN);
+	return rval;
+}
+
+/*
+ * Replace the inode number of a directory entry.
+ */
+static int				/* error */
+xfs_dir2_replace(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* name of entry to replace */
+	int		namelen,	/* name length of entry to replace */
+	xfs_ino_t	inum,		/* new inode number */
+	xfs_fsblock_t	*first,		/* bmap's firstblock */
+	xfs_bmap_free_t *flist,		/* bmap's freeblock list */
+	xfs_extlen_t	total)		/* bmap's total block count */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	if (namelen >= MAXNAMELEN) {
+		return XFS_ERROR(EINVAL);
+	}
+	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
+		return rval;
+	}
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = inum;
+	args.dp = dp;
+	args.firstblock = first;
+	args.flist = flist;
+	args.total = total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = args.addname = args.oknoent = 0;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_replace(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_replace(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_replace(&args);
+	else
+		rval = xfs_dir2_node_replace(&args);
+	return rval;
+}
+
+/*
+ * See if this entry can be added to the directory without allocating space.
+ */
+static int				/* error */
+xfs_dir2_canenter(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	char		*name,		/* name of entry to add */
+	int		namelen)	/* name length of entry to add */
+{
+	xfs_da_args_t	args;		/* operation arguments */
+	int		rval;		/* return value */
+	int		v;		/* type-checking value */
+
+	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	/*
+	 * Fill in the arg structure for this request.
+	 */
+	args.name = name;
+	args.namelen = namelen;
+	args.hashval = xfs_da_hashname(name, namelen);
+	args.inumber = 0;
+	args.dp = dp;
+	args.firstblock = NULL;
+	args.flist = NULL;
+	args.total = 0;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = tp;
+	args.justcheck = args.addname = args.oknoent = 1;
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
+		rval = xfs_dir2_sf_addname(&args);
+	else if ((rval = xfs_dir2_isblock(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_block_addname(&args);
+	else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) {
+		return rval;
+	} else if (v)
+		rval = xfs_dir2_leaf_addname(&args);
+	else
+		rval = xfs_dir2_node_addname(&args);
+	return rval;
+}
+
+/*
+ * Dummy routine for shortform inode validation.
+ * Can't really do this.
+ */
+/* ARGSUSED */
+static int				/* error */
+xfs_dir2_shortform_validate_ondisk(
+	xfs_mount_t	*mp,		/* filesystem mount point */
+	xfs_dinode_t	*dip)		/* ondisk inode */
+{
+	return 0;
+}
+
+/*
+ * Utility routines.
+ */
+
+/*
+ * Add a block to the directory.
+ * This routine is for data and free blocks, not leaf/node blocks
+ * which are handled by xfs_da_grow_inode.
+ */
+int					/* error */
+xfs_dir2_grow_inode(
+	xfs_da_args_t	*args,		/* operation arguments */
+	int		space,		/* v2 dir's space XFS_DIR2_xxx_SPACE */
+	xfs_dir2_db_t	*dbp)		/* out: block number added */
+{
+	xfs_fileoff_t	bno;		/* directory offset of new block */
+	int		count;		/* count of filesystem blocks */
+	xfs_inode_t	*dp;		/* incore directory inode */
+	int		error;		/* error return value */
+	int		got;		/* blocks actually mapped */
+	int		i;		/* temp mapping index */
+	xfs_bmbt_irec_t map;		/* single structure for bmap */
+	int		mapi;		/* mapping index */
+	xfs_bmbt_irec_t *mapp;		/* bmap mapping structure(s) */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	int		nmap;		/* number of bmap entries */
+	xfs_trans_t	*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_s("grow_inode", args, space);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Set lowest possible block in the space requested.
+	 */
+	bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
+	count = mp->m_dirblkfsbs;
+	/*
+	 * Find the first hole for our block.
+	 */
+	if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) {
+		return error;
+	}
+	nmap = 1;
+	ASSERT(args->firstblock != NULL);
+	/*
+	 * Try mapping the new block contiguously (one extent).
+	 */
+	if ((error = xfs_bmapi(tp, dp, bno, count,
+			XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
+			args->firstblock, args->total, &map, &nmap,
+			args->flist))) {
+		return error;
+	}
+	ASSERT(nmap <= 1);
+	/*
+	 * Got it in 1.
+	 */
+	if (nmap == 1) {
+		mapp = &map;
+		mapi = 1;
+	}
+	/*
+	 * Didn't work and this is a multiple-fsb directory block.
+	 * Try again with contiguous flag turned on.
+	 */
+	else if (nmap == 0 && count > 1) {
+		xfs_fileoff_t	b;	/* current file offset */
+
+		/*
+		 * Space for maximum number of mappings.
+		 */
+		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
+		/*
+		 * Iterate until we get to the end of our block.
+		 */
+		for (b = bno, mapi = 0; b < bno + count; ) {
+			int	c;	/* current fsb count */
+
+			/*
+			 * Can't map more than MAX_NMAP at once.
+			 */
+			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
+			c = (int)(bno + count - b);
+			if ((error = xfs_bmapi(tp, dp, b, c,
+					XFS_BMAPI_WRITE|XFS_BMAPI_METADATA,
+					args->firstblock, args->total,
+					&mapp[mapi], &nmap, args->flist))) {
+				kmem_free(mapp, sizeof(*mapp) * count);
+				return error;
+			}
+			if (nmap < 1)
+				break;
+			/*
+			 * Add this bunch into our table, go to the next offset.
+			 */
+			mapi += nmap;
+			b = mapp[mapi - 1].br_startoff +
+			    mapp[mapi - 1].br_blockcount;
+		}
+	}
+	/*
+	 * Didn't work.
+	 */
+	else {
+		mapi = 0;
+		mapp = NULL;
+	}
+	/*
+	 * See how many fsb's we got.
+	 */
+	for (i = 0, got = 0; i < mapi; i++)
+		got += mapp[i].br_blockcount;
+	/*
+	 * Didn't get enough fsb's, or the first/last block's are wrong.
+	 */
+	if (got != count || mapp[0].br_startoff != bno ||
+	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
+	    bno + count) {
+		if (mapp != &map)
+			kmem_free(mapp, sizeof(*mapp) * count);
+		return XFS_ERROR(ENOSPC);
+	}
+	/*
+	 * Done with the temporary mapping table.
+	 */
+	if (mapp != &map)
+		kmem_free(mapp, sizeof(*mapp) * count);
+	*dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno);
+	/*
+	 * Update file's size if this is the data space and it grew.
+	 */
+	if (space == XFS_DIR2_DATA_SPACE) {
+		xfs_fsize_t	size;		/* directory file (data) size */
+
+		size = XFS_FSB_TO_B(mp, bno + count);
+		if (size > dp->i_d.di_size) {
+			dp->i_d.di_size = size;
+			xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+		}
+	}
+	return 0;
+}
+
+/*
+ * See if the directory is a single-block form directory.
+ */
+int					/* error */
+xfs_dir2_isblock(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	int		*vp)		/* out: 1 is block, 0 is not block */
+{
+	xfs_fileoff_t	last;		/* last file offset */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	int		rval;		/* return value */
+
+	mp = dp->i_mount;
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+		return rval;
+	}
+	rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
+	ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
+	*vp = rval;
+	return 0;
+}
+
+/*
+ * See if the directory is a single-leaf form directory.
+ */
+int					/* error */
+xfs_dir2_isleaf(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_inode_t	*dp,		/* incore directory inode */
+	int		*vp)		/* out: 1 is leaf, 0 is not leaf */
+{
+	xfs_fileoff_t	last;		/* last file offset */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	int		rval;		/* return value */
+
+	mp = dp->i_mount;
+	if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) {
+		return rval;
+	}
+	*vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
+	return 0;
+}
+
+/*
+ * Getdents put routine for 64-bit ABI, direct form.
+ */
+static int					/* error */
+xfs_dir2_put_dirent64_direct(
+	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+{
+	xfs_dirent_t		*idbp;		/* dirent pointer */
+	iovec_t			*iovp;		/* io vector */
+	int			namelen;	/* entry name length */
+	int			reclen;		/* entry total length */
+	uio_t			*uio;		/* I/O control */
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	/*
+	 * Won't fit in the remaining space.
+	 */
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	iovp = uio->uio_iov;
+	idbp = (xfs_dirent_t *)iovp->iov_base;
+	iovp->iov_base = (char *)idbp + reclen;
+	iovp->iov_len -= reclen;
+	uio->uio_resid -= reclen;
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook;
+	idbp->d_name[namelen] = '\0';
+	pa->done = 1;
+	bcopy(pa->name, idbp->d_name, namelen);
+	return 0;
+}
+
+/*
+ * Getdents put routine for 64-bit ABI, uio form.
+ */
+static int					/* error */
+xfs_dir2_put_dirent64_uio(
+	xfs_dir2_put_args_t	*pa)		/* argument bundle */
+{
+	xfs_dirent_t		*idbp;		/* dirent pointer */
+	int			namelen;	/* entry name length */
+	int			reclen;		/* entry total length */
+	int			rval;		/* return value */
+	uio_t			*uio;		/* I/O control */
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	/*
+	 * Won't fit in the remaining space.
+	 */
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	idbp = pa->dbp;
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook;
+	idbp->d_name[namelen] = '\0';
+	bcopy(pa->name, idbp->d_name, namelen);
+	rval = uiomove((caddr_t)idbp, reclen, UIO_READ, uio);
+	pa->done = (rval == 0);
+	return rval;
+}
+
+/*
+ * Remove the given block from the directory.
+ * This routine is used for data and free blocks, leaf/node are done
+ * by xfs_da_shrink_inode.
+ */
+int
+xfs_dir2_shrink_inode(
+	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_dir2_db_t	db,		/* directory block number */
+	xfs_dabuf_t	*bp)		/* block's buffer */
+{
+	xfs_fileoff_t	bno;		/* directory file offset */
+	xfs_dablk_t	da;		/* directory file offset */
+	int		done;		/* bunmap is finished */
+	xfs_inode_t	*dp;		/* incore directory inode */
+	int		error;		/* error return value */
+	xfs_mount_t	*mp;		/* filesystem mount point */
+	xfs_trans_t	*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_db("shrink_inode", args, db, bp);
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	da = XFS_DIR2_DB_TO_DA(mp, db);
+	/*
+	 * Unmap the fsblock(s).
+	 */
+	if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
+			XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
+			&done))) {
+		/*
+		 * ENOSPC actually can happen if we're in a removename with
+		 * no space reservation, and the resulting block removal
+		 * would cause a bmap btree split or conversion from extents
+		 * to btree.  This can only happen for un-fragmented
+		 * directory blocks, since you need to be punching out
+		 * the middle of an extent.
+		 * In this case we need to leave the block in the file,
+		 * and not binval it.
+		 * So the block has to be in a consistent empty state
+		 * and appropriately logged.
+		 * We don't free up the buffer, the caller can tell it
+		 * hasn't happened since it got an error back.
+		 */
+		return error;
+	}
+	ASSERT(done);
+	/*
+	 * Invalidate the buffer from the transaction.
+	 */
+	xfs_da_binval(tp, bp);
+	/*
+	 * If it's not a data block, we're done.
+	 */
+	if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
+		return 0;
+	/*
+	 * If the block isn't the last one in the directory, we're done.
+	 */
+	if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0))
+		return 0;
+	bno = da;
+	if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
+		/*
+		 * This can't really happen unless there's kernel corruption.
+		 */
+		return error;
+	}
+	if (db == mp->m_dirdatablk)
+		ASSERT(bno == 0);
+	else
+		ASSERT(bno > 0);
+	/*
+	 * Set the size to the new last block.
+	 */
+	dp->i_d.di_size = XFS_FSB_TO_B(mp, bno);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	return 0;
+}
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
new file mode 100644
index 000000000000..b40378857536
--- /dev/null
+++ b/fs/xfs/xfs_dir2.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_H__
+#define __XFS_DIR2_H__
+
+struct dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_dir2_put_args;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Directory version 2.
+ * There are 4 possible formats:
+ *	shortform
+ *	single block - data with embedded leaf at the end
+ *	multiple data blocks, single leaf+freeindex block
+ *	data blocks, node&leaf blocks (btree), freeindex blocks
+ *
+ *	The shortform format is in xfs_dir2_sf.h.
+ *	The single block format is in xfs_dir2_block.h.
+ *	The data block format is in xfs_dir2_data.h.
+ *	The leaf and freeindex block formats are in xfs_dir2_leaf.h.
+ *	Node blocks are the same as the other version, in xfs_da_btree.h.
+ */
+
+/*
+ * Byte offset in data block and shortform entry.
+ */
+typedef __uint16_t	xfs_dir2_data_off_t;
+#define NULLDATAOFF	0xffffU
+typedef uint		xfs_dir2_data_aoff_t;	/* argument form */
+
+/*
+ * Directory block number (logical dirblk in file)
+ */
+typedef __uint32_t	xfs_dir2_db_t;
+
+/*
+ * Byte offset in a directory.
+ */
+typedef xfs_off_t		xfs_dir2_off_t;
+
+/*
+ * For getdents, argument struct for put routines.
+ */
+typedef int (*xfs_dir2_put_t)(struct xfs_dir2_put_args *pa);
+typedef struct xfs_dir2_put_args {
+	xfs_off_t		cook;		/* cookie of (next) entry */
+	xfs_intino_t	ino;		/* inode number */
+	struct xfs_dirent	*dbp;		/* buffer pointer */
+	char		*name;		/* directory entry name */
+	int		namelen;	/* length of name */
+	int		done;		/* output: set if value was stored */
+	xfs_dir2_put_t	put;		/* put function ptr (i/o) */
+	struct uio	*uio;		/* uio control structure */
+} xfs_dir2_put_args_t;
+
+#define XFS_DIR_IS_V2(mp)	((mp)->m_dirversion == 2)
+extern xfs_dirops_t	xfsv2_dirops;
+
+/*
+ * Other interfaces used by the rest of the dir v2 code.
+ */
+extern int
+	xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
+			    xfs_dir2_db_t *dbp);
+
+extern int
+	xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
+
+extern int
+	xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *vp);
+
+extern int
+	xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
+			      struct xfs_dabuf *bp);
+
+#endif	/* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
new file mode 100644
index 000000000000..de56814d5d25
--- /dev/null
+++ b/fs/xfs/xfs_dir2_block.c
@@ -0,0 +1,1231 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_block.c
+ * XFS V2 directory implementation, single-block form.
+ * See xfs_dir2_block.h for the format.
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Local function prototypes.
+ */
+static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first,
+				    int last);
+static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp);
+static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp,
+				     int *entno);
+static int xfs_dir2_block_sort(const void *a, const void *b);
+
+/*
+ * Add an entry to a block directory.
+ */
+int						/* error */
+xfs_dir2_block_addname(
+	xfs_da_args_t		*args)		/* directory op arguments */
+{
+	xfs_dir2_data_free_t	*bf;		/* bestfree table in block */
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	int			compact;	/* need to compact leaf ents */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
+	int			error;		/* error return value */
+	xfs_dir2_data_unused_t	*enddup=NULL;	/* unused at end of data */
+	xfs_dahash_t		hash;		/* hash value of found entry */
+	int			high;		/* high index for binary srch */
+	int			highstale;	/* high stale index */
+	int			lfloghigh=0;	/* last final leaf to log */
+	int			lfloglow=0;	/* first final leaf to log */
+	int			len;		/* length of the new entry */
+	int			low;		/* low index for binary srch */
+	int			lowstale;	/* low stale index */
+	int			mid=0;		/* midpoint for binary srch */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log header */
+	int			needscan;	/* need to rescan freespace */
+	xfs_dir2_data_off_t	*tagp;		/* pointer to tag value */
+	xfs_trans_t		*tp;		/* transaction structure */
+
+	xfs_dir2_trace_args("block_addname", args);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the (one and only) directory block into dabuf bp.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	block = bp->data;
+	/*
+	 * Check the magic number, corrupted if wrong.
+	 */
+	if (INT_GET(block->hdr.magic, ARCH_CONVERT) != XFS_DIR2_BLOCK_MAGIC) {
+		xfs_da_brelse(tp, bp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	len = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+	/*
+	 * Set up pointers to parts of the block.
+	 */
+	bf = block->hdr.bestfree;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * No stale entries?  Need space for entry and new leaf.
+	 */
+	if (INT_ISZERO(btp->stale, ARCH_CONVERT)) {
+		/*
+		 * Tag just before the first leaf entry.
+		 */
+		tagp = (xfs_dir2_data_off_t *)blp - 1;
+		/*
+		 * Data object just before the first leaf entry.
+		 */
+		enddup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+		/*
+		 * If it's not free then can't do this add without cleaning up:
+		 * the space before the first leaf entry needs to be free so it
+		 * can be expanded to hold the pointer to the new entry.
+		 */
+		if (INT_GET(enddup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
+			dup = enddup = NULL;
+		/*
+		 * Check out the biggest freespace and see if it's the same one.
+		 */
+		else {
+			dup = (xfs_dir2_data_unused_t *)
+			      ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
+			if (dup == enddup) {
+				/*
+				 * It is the biggest freespace, is it too small
+				 * to hold the new leaf too?
+				 */
+				if (INT_GET(dup->length, ARCH_CONVERT) < len + (uint)sizeof(*blp)) {
+					/*
+					 * Yes, we use the second-largest
+					 * entry instead if it works.
+					 */
+					if (INT_GET(bf[1].length, ARCH_CONVERT) >= len)
+						dup = (xfs_dir2_data_unused_t *)
+						      ((char *)block +
+						       INT_GET(bf[1].offset, ARCH_CONVERT));
+					else
+						dup = NULL;
+				}
+			} else {
+				/*
+				 * Not the same free entry,
+				 * just check its length.
+				 */
+				if (INT_GET(dup->length, ARCH_CONVERT) < len) {
+					dup = NULL;
+				}
+			}
+		}
+		compact = 0;
+	}
+	/*
+	 * If there are stale entries we'll use one for the leaf.
+	 * Is the biggest entry enough to avoid compaction?
+	 */
+	else if (INT_GET(bf[0].length, ARCH_CONVERT) >= len) {
+		dup = (xfs_dir2_data_unused_t *)
+		      ((char *)block + INT_GET(bf[0].offset, ARCH_CONVERT));
+		compact = 0;
+	}
+	/*
+	 * Will need to compact to make this work.
+	 */
+	else {
+		/*
+		 * Tag just before the first leaf entry.
+		 */
+		tagp = (xfs_dir2_data_off_t *)blp - 1;
+		/*
+		 * Data object just before the first leaf entry.
+		 */
+		dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+		/*
+		 * If it's not free then the data will go where the
+		 * leaf data starts now, if it works at all.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			if (INT_GET(dup->length, ARCH_CONVERT) + (INT_GET(btp->stale, ARCH_CONVERT) - 1) *
+			    (uint)sizeof(*blp) < len)
+				dup = NULL;
+		} else if ((INT_GET(btp->stale, ARCH_CONVERT) - 1) * (uint)sizeof(*blp) < len)
+			dup = NULL;
+		else
+			dup = (xfs_dir2_data_unused_t *)blp;
+		compact = 1;
+	}
+	/*
+	 * If this isn't a real add, we're done with the buffer.
+	 */
+	if (args->justcheck)
+		xfs_da_brelse(tp, bp);
+	/*
+	 * If we don't have space for the new entry & leaf ...
+	 */
+	if (!dup) {
+		/*
+		 * Not trying to actually do anything, or don't have
+		 * a space reservation: return no-space.
+		 */
+		if (args->justcheck || args->total == 0)
+			return XFS_ERROR(ENOSPC);
+		/*
+		 * Convert to the next larger format.
+		 * Then add the new entry in that format.
+		 */
+		error = xfs_dir2_block_to_leaf(args, bp);
+		xfs_da_buf_done(bp);
+		if (error)
+			return error;
+		return xfs_dir2_leaf_addname(args);
+	}
+	/*
+	 * Just checking, and it would work, so say so.
+	 */
+	if (args->justcheck)
+		return 0;
+	needlog = needscan = 0;
+	/*
+	 * If need to compact the leaf entries, do it now.
+	 * Leave the highest-numbered stale entry stale.
+	 * XXX should be the one closest to mid but mid is not yet computed.
+	 */
+	if (compact) {
+		int	fromidx;		/* source leaf index */
+		int	toidx;			/* target leaf index */
+
+		for (fromidx = toidx = INT_GET(btp->count, ARCH_CONVERT) - 1,
+			highstale = lfloghigh = -1;
+		     fromidx >= 0;
+		     fromidx--) {
+			if (INT_GET(blp[fromidx].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
+				if (highstale == -1)
+					highstale = toidx;
+				else {
+					if (lfloghigh == -1)
+						lfloghigh = toidx;
+					continue;
+				}
+			}
+			if (fromidx < toidx)
+				blp[toidx] = blp[fromidx];
+			toidx--;
+		}
+		lfloglow = toidx + 1 - (INT_GET(btp->stale, ARCH_CONVERT) - 1);
+		lfloghigh -= INT_GET(btp->stale, ARCH_CONVERT) - 1;
+		INT_MOD(btp->count, ARCH_CONVERT, -(INT_GET(btp->stale, ARCH_CONVERT) - 1));
+		xfs_dir2_data_make_free(tp, bp,
+			(xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
+			(xfs_dir2_data_aoff_t)((INT_GET(btp->stale, ARCH_CONVERT) - 1) * sizeof(*blp)),
+			&needlog, &needscan);
+		blp += INT_GET(btp->stale, ARCH_CONVERT) - 1;
+		INT_SET(btp->stale, ARCH_CONVERT, 1);
+		/*
+		 * If we now need to rebuild the bestfree map, do so.
+		 * This needs to happen before the next call to use_free.
+		 */
+		if (needscan) {
+			xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
+				&needlog, NULL);
+			needscan = 0;
+		}
+	}
+	/*
+	 * Set leaf logging boundaries to impossible state.
+	 * For the no-stale case they're set explicitly.
+	 */
+	else if (INT_GET(btp->stale, ARCH_CONVERT)) {
+		lfloglow = INT_GET(btp->count, ARCH_CONVERT);
+		lfloghigh = -1;
+	}
+	/*
+	 * Find the slot that's first lower than our hash value, -1 if none.
+	 */
+	for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	while (mid >= 0 && INT_GET(blp[mid].hashval, ARCH_CONVERT) >= args->hashval) {
+		mid--;
+	}
+	/*
+	 * No stale entries, will use enddup space to hold new leaf.
+	 */
+	if (INT_ISZERO(btp->stale, ARCH_CONVERT)) {
+		/*
+		 * Mark the space needed for the new leaf entry, now in use.
+		 */
+		xfs_dir2_data_use_free(tp, bp, enddup,
+			(xfs_dir2_data_aoff_t)
+			((char *)enddup - (char *)block + INT_GET(enddup->length, ARCH_CONVERT) -
+			 sizeof(*blp)),
+			(xfs_dir2_data_aoff_t)sizeof(*blp),
+			&needlog, &needscan);
+		/*
+		 * Update the tail (entry count).
+		 */
+		INT_MOD(btp->count, ARCH_CONVERT, +1);
+		/*
+		 * If we now need to rebuild the bestfree map, do so.
+		 * This needs to happen before the next call to use_free.
+		 */
+		if (needscan) {
+			xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block,
+				&needlog, NULL);
+			needscan = 0;
+		}
+		/*
+		 * Adjust pointer to the first leaf entry, we're about to move
+		 * the table up one to open up space for the new leaf entry.
+		 * Then adjust our index to match.
+		 */
+		blp--;
+		mid++;
+		if (mid)
+			ovbcopy(&blp[1], blp, mid * sizeof(*blp));
+		lfloglow = 0;
+		lfloghigh = mid;
+	}
+	/*
+	 * Use a stale leaf for our new entry.
+	 */
+	else {
+		for (lowstale = mid;
+		     lowstale >= 0 &&
+			INT_GET(blp[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
+		     lowstale--)
+			continue;
+		for (highstale = mid + 1;
+		     highstale < INT_GET(btp->count, ARCH_CONVERT) &&
+			INT_GET(blp[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
+			(lowstale < 0 || mid - lowstale > highstale - mid);
+		     highstale++)
+			continue;
+		/*
+		 * Move entries toward the low-numbered stale entry.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == INT_GET(btp->count, ARCH_CONVERT) ||
+		     mid - lowstale <= highstale - mid)) {
+			if (mid - lowstale)
+				ovbcopy(&blp[lowstale + 1], &blp[lowstale],
+					(mid - lowstale) * sizeof(*blp));
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(mid, lfloghigh);
+		}
+		/*
+		 * Move entries toward the high-numbered stale entry.
+		 */
+		else {
+			ASSERT(highstale < INT_GET(btp->count, ARCH_CONVERT));
+			mid++;
+			if (highstale - mid)
+				ovbcopy(&blp[mid], &blp[mid + 1],
+					(highstale - mid) * sizeof(*blp));
+			lfloglow = MIN(mid, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		INT_MOD(btp->stale, ARCH_CONVERT, -1);
+	}
+	/*
+	 * Point to the new data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	/*
+	 * Fill in the leaf entry.
+	 */
+	INT_SET(blp[mid].hashval, ARCH_CONVERT, args->hashval);
+	INT_SET(blp[mid].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
+	xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
+	/*
+	 * Mark space for the data entry used.
+	 */
+	xfs_dir2_data_use_free(tp, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
+		(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
+	/*
+	 * Create the new data entry.
+	 */
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->namelen = args->namelen;
+	bcopy(args->name, dep->name, args->namelen);
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+	/*
+	 * Clean up the bestfree array and log the header, tail, and entry.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, bp);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir2_data_log_entry(tp, bp, dep);
+	xfs_dir2_data_check(dp, bp);
+	xfs_da_buf_done(bp);
+	return 0;
+}
+
+/*
+ * Readdir for block directories.
+ */
+int						/* error */
+xfs_dir2_block_getdents(
+	xfs_trans_t		*tp,		/* transaction (NULL) */
+	xfs_inode_t		*dp,		/* incore inode */
+	uio_t			*uio,		/* caller's buffer control */
+	int			*eofp,		/* eof reached? (out) */
+	xfs_dirent_t		*dbp,		/* caller's buffer */
+	xfs_dir2_put_t		put)		/* abi's formatting function */
+{
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dabuf_t		*bp;		/* buffer for block */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
+	char			*endptr;	/* end of the data entries */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_put_args_t	p;		/* arg package for put rtn */
+	char			*ptr;		/* current data entry */
+	char			*savptr;	/* saved data entry */
+	int			wantoff;	/* starting block offset */
+
+	mp = dp->i_mount;
+	/*
+	 * If the block number in the offset is out of range, we're done.
+	 */
+	if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) {
+		*eofp = 1;
+		return 0;
+	}
+	/*
+	 * Can't read the block, give up, else get dabuf in bp.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	/*
+	 * Extract the byte offset we start at from the seek pointer.
+	 * We'll skip entries before this.
+	 */
+	wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset);
+	block = bp->data;
+	xfs_dir2_data_check(dp, bp);
+	/*
+	 * Set up values for the loop.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	ptr = (char *)block->u;
+	endptr = (char *)XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+	/*
+	 * Loop over the data portion of the block.
+	 * Each object is a real entry (dep) or an unused one (dup).
+	 */
+	while (ptr < endptr) {
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		/*
+		 * Unused, skip it.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			ptr += INT_GET(dup->length, ARCH_CONVERT);
+			continue;
+		}
+
+		dep = (xfs_dir2_data_entry_t *)ptr;
+
+		savptr = ptr;		/* In case we need it.. */
+
+		/*
+		 * Bump pointer for the next iteration.
+		 */
+		ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+		/*
+		 * The entry is before the desired starting point, skip it.
+		 */
+		if ((char *)dep - (char *)block < wantoff)
+			continue;
+		/*
+		 * Set up argument structure for put routine.
+		 */
+		p.namelen = dep->namelen;
+
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+						    savptr - (char *)block);
+#if XFS_BIG_FILESYSTEMS
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT) + mp->m_inoadd;
+#else
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
+#endif
+		p.name = (char *)dep->name;
+
+		/*
+		 * Put the entry in the caller's buffer.
+		 */
+		error = p.put(&p);
+
+		/*
+		 * If it didn't fit, set the final offset to here & return.
+		 */
+		if (!p.done) {
+			uio->uio_offset =
+				XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+					(char *)dep - (char *)block);
+			xfs_da_brelse(tp, bp);
+			return error;
+		}
+	}
+
+	/*
+	 * Reached the end of the block.
+	 * Set the offset to a nonexistent block 1 and return.
+	 */
+	*eofp = 1;
+
+	uio->uio_offset =
+		XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
+
+	xfs_da_brelse(tp, bp);
+
+	return 0;
+}
+
+/*
+ * Log leaf entries from the block.
+ */
+static void
+xfs_dir2_block_log_leaf(
+	xfs_trans_t		*tp,		/* transaction structure */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	int			first,		/* index of first logged leaf */
+	int			last)		/* index of last logged leaf */
+{
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	mp = tp->t_mountp;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block),
+		(uint)((char *)&blp[last + 1] - (char *)block - 1));
+}
+
+/*
+ * Log the block tail.
+ */
+static void
+xfs_dir2_block_log_tail(
+	xfs_trans_t		*tp,		/* transaction structure */
+	xfs_dabuf_t		*bp)		/* block buffer */
+{
+	xfs_dir2_block_t	*block;		/* directory block structure */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	mp = tp->t_mountp;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block),
+		(uint)((char *)(btp + 1) - (char *)block - 1));
+}
+
+/*
+ * Look up an entry in the block.  This is the external routine,
+ * xfs_dir2_block_lookup_int does the real work.
+ */
+int						/* error */
+xfs_dir2_block_lookup(
+	xfs_da_args_t		*args)		/* dir lookup arguments */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	xfs_dir2_trace_args("block_lookup", args);
+	/*
+	 * Get the buffer, look up the entry.
+	 * If not found (ENOENT) then return, have no buffer.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent)))
+		return error;
+	dp = args->dp;
+	mp = dp->i_mount;
+	block = bp->data;
+	xfs_dir2_data_check(dp, bp);
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Get the offset from the leaf entry, to point to the data.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+	/*
+	 * Fill in inode number, release the block.
+	 */
+	args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+	xfs_da_brelse(args->trans, bp);
+	return XFS_ERROR(EEXIST);
+}
+
+/*
+ * Internal block lookup routine.
+ */
+static int					/* error */
+xfs_dir2_block_lookup_int(
+	xfs_da_args_t		*args,		/* dir lookup arguments */
+	xfs_dabuf_t		**bpp,		/* returned block buffer */
+	int			*entno)		/* returned entry number */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			error;		/* error return value */
+	xfs_dahash_t		hash;		/* found hash value */
+	int			high;		/* binary search high index */
+	int			low;		/* binary search low index */
+	int			mid;		/* binary search current idx */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the buffer, return error if we can't get it.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	block = bp->data;
+	xfs_dir2_data_check(dp, bp);
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Loop doing a binary search for our hash value.
+	 * Find our entry, ENOENT if it's not there.
+	 */
+	for (low = 0, high = INT_GET(btp->count, ARCH_CONVERT) - 1; ; ) {
+		ASSERT(low <= high);
+		mid = (low + high) >> 1;
+		if ((hash = INT_GET(blp[mid].hashval, ARCH_CONVERT)) == args->hashval)
+			break;
+		if (hash < args->hashval)
+			low = mid + 1;
+		else
+			high = mid - 1;
+		if (low > high) {
+			ASSERT(args->oknoent);
+			xfs_da_brelse(tp, bp);
+			return XFS_ERROR(ENOENT);
+		}
+	}
+	/*
+	 * Back up to the first one with the right hash value.
+	 */
+	while (mid > 0 && INT_GET(blp[mid - 1].hashval, ARCH_CONVERT) == args->hashval) {
+		mid--;
+	}
+	/*
+	 * Now loop forward through all the entries with the
+	 * right hash value looking for our name.
+	 */
+	do {
+		if ((addr = INT_GET(blp[mid].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Get pointer to the entry from the leaf.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+			((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
+		/*
+		 * Compare, if it's right give back buffer & entry number.
+		 */
+		if (dep->namelen == args->namelen &&
+		    dep->name[0] == args->name[0] &&
+		    bcmp(dep->name, args->name, args->namelen) == 0) {
+			*bpp = bp;
+			*entno = mid;
+			return 0;
+		}
+	} while (++mid < INT_GET(btp->count, ARCH_CONVERT) && INT_GET(blp[mid].hashval, ARCH_CONVERT) == hash);
+	/*
+	 * No match, release the buffer and return ENOENT.
+	 */
+	ASSERT(args->oknoent);
+	xfs_da_brelse(tp, bp);
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Remove an entry from a block format directory.
+ * If that makes the block small enough to fit in shortform, transform it.
+ */
+int						/* error */
+xfs_dir2_block_removename(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf pointer */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* block leaf entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to fixup bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* shortform size */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("block_removename", args);
+	/*
+	 * Look up the entry in the block.  Gets the buffer and entry index.
+	 * It will always be there, the vnodeops level does a lookup first.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Point to the data entry using the leaf entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+	/*
+	 * Mark the data entry's space free.
+	 */
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(tp, bp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)block),
+		XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+	/*
+	 * Fix up the block tail.
+	 */
+	INT_MOD(btp->stale, ARCH_CONVERT, +1);
+	xfs_dir2_block_log_tail(tp, bp);
+	/*
+	 * Remove the leaf entry by marking it stale.
+	 */
+	INT_SET(blp[ent].address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_block_log_leaf(tp, bp, ent, ent);
+	/*
+	 * Fix up bestfree, log the header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, bp);
+	xfs_dir2_data_check(dp, bp);
+	/*
+	 * See if the size as a shortform is good enough.
+	 */
+	if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
+	    XFS_IFORK_DSIZE(dp)) {
+		xfs_da_buf_done(bp);
+		return 0;
+	}
+	/*
+	 * If it works, do the conversion.
+	 */
+	return xfs_dir2_block_to_sf(args, bp, size, &sfh);
+}
+
+/*
+ * Replace an entry in a V2 block directory.
+ * Change the inode number to the new value.
+ */
+int						/* error */
+xfs_dir2_block_replace(
+	xfs_da_args_t		*args)		/* directory operation args */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* block data entry */
+	xfs_inode_t		*dp;		/* incore inode */
+	int			ent;		/* leaf entry index */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	xfs_dir2_trace_args("block_replace", args);
+	/*
+	 * Lookup the entry in the directory.  Get buffer and entry index.
+	 * This will always succeed since the caller has already done a lookup.
+	 */
+	if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) {
+		return error;
+	}
+	dp = args->dp;
+	mp = dp->i_mount;
+	block = bp->data;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Point to the data entry we need to change.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(blp[ent].address, ARCH_CONVERT)));
+	ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) != args->inumber);
+	/*
+	 * Change the inode number to the new value.
+	 */
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	xfs_dir2_data_log_entry(args->trans, bp, dep);
+	xfs_dir2_data_check(dp, bp);
+	xfs_da_buf_done(bp);
+	return 0;
+}
+
+/*
+ * Qsort comparison routine for the block leaf entries.
+ */
+static int					/* sort order */
+xfs_dir2_block_sort(
+	const void			*a,	/* first leaf entry */
+	const void			*b)	/* second leaf entry */
+{
+	const xfs_dir2_leaf_entry_t	*la;	/* first leaf entry */
+	const xfs_dir2_leaf_entry_t	*lb;	/* second leaf entry */
+
+	la = a;
+	lb = b;
+	return INT_GET(la->hashval, ARCH_CONVERT) < INT_GET(lb->hashval, ARCH_CONVERT) ? -1 :
+		(INT_GET(la->hashval, ARCH_CONVERT) > INT_GET(lb->hashval, ARCH_CONVERT) ? 1 : 0);
+}
+
+/*
+ * Convert a V2 leaf directory to a V2 block directory if possible.
+ */
+int						/* error */
+xfs_dir2_leaf_to_block(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*lbp,		/* leaf buffer */
+	xfs_dabuf_t		*dbp)		/* data buffer */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* leaf bests table */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	int			error;		/* error return value */
+	int			from;		/* leaf from index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* file system mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to scan for bestfree */
+	xfs_dir2_sf_hdr_t	sfh;		/* shortform header */
+	int			size;		/* bytes used */
+	xfs_dir2_data_off_t	*tagp;		/* end of entry (tag) */
+	int			to;		/* block/leaf to index */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = lbp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	/*
+	 * If there are data blocks other than the first one, take this
+	 * opportunity to remove trailing empty data blocks that may have
+	 * been left behind during no-space-reservation operations.
+	 * These will show up in the leaf bests table.
+	 */
+	while (dp->i_d.di_size > mp->m_dirblksize) {
+		bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+		if (INT_GET(bestsp[INT_GET(ltp->bestcount, ARCH_CONVERT) - 1], ARCH_CONVERT) ==
+		    mp->m_dirblksize - (uint)sizeof(block->hdr)) {
+			if ((error =
+			    xfs_dir2_leaf_trim_data(args, lbp,
+				    (xfs_dir2_db_t)(INT_GET(ltp->bestcount, ARCH_CONVERT) - 1))))
+				goto out;
+		} else {
+			error = 0;
+			goto out;
+		}
+	}
+	/*
+	 * Read the data block if we don't already have it, give up if it fails.
+	 */
+	if (dbp == NULL &&
+	    (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
+		    XFS_DATA_FORK))) {
+		goto out;
+	}
+	block = dbp->data;
+	ASSERT(INT_GET(block->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+	/*
+	 * Size of the "leaf" area in the block.
+	 */
+	size = (uint)sizeof(block->tail) +
+	       (uint)sizeof(*lep) * (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
+	/*
+	 * Look at the last data entry.
+	 */
+	tagp = (xfs_dir2_data_off_t *)((char *)block + mp->m_dirblksize) - 1;
+	dup = (xfs_dir2_data_unused_t *)((char *)block + INT_GET(*tagp, ARCH_CONVERT));
+	/*
+	 * If it's not free or is too short we can't do it.
+	 */
+	if (INT_GET(dup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG || INT_GET(dup->length, ARCH_CONVERT) < size) {
+		error = 0;
+		goto out;
+	}
+	/*
+	 * Start converting it to block form.
+	 */
+	INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
+	needlog = 1;
+	needscan = 0;
+	/*
+	 * Use up the space at the end of the block (blp/btp).
+	 */
+	xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
+		&needlog, &needscan);
+	/*
+	 * Initialize the block tail.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	INT_SET(btp->count, ARCH_CONVERT, INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT));
+	INT_ZERO(btp->stale, ARCH_CONVERT);
+	xfs_dir2_block_log_tail(tp, dbp);
+	/*
+	 * Initialize the block leaf area.  We compact out stale entries.
+	 */
+	lep = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
+		if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		lep[to++] = leaf->ents[from];
+	}
+	ASSERT(to == INT_GET(btp->count, ARCH_CONVERT));
+	xfs_dir2_block_log_leaf(tp, dbp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
+	/*
+	 * Scan the bestfree if we need it and log the data block header.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	/*
+	 * Pitch the old leaf block.
+	 */
+	error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
+	lbp = NULL;
+	if (error) {
+		goto out;
+	}
+	/*
+	 * Now see if the resulting block can be shrunken to shortform.
+	 */
+	if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) >
+	    XFS_IFORK_DSIZE(dp)) {
+		error = 0;
+		goto out;
+	}
+	return xfs_dir2_block_to_sf(args, dbp, size, &sfh);
+out:
+	if (lbp)
+		xfs_da_buf_done(lbp);
+	if (dbp)
+		xfs_da_buf_done(dbp);
+	return error;
+}
+
+/*
+ * Convert the shortform directory to block form.
+ */
+int						/* error */
+xfs_dir2_sf_to_block(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dir2_db_t		blkno;		/* dir-relative block # (0) */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
+	char			*buf;		/* sf buffer */
+	int			buf_len;
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			dummy;		/* trash */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	int			endoffset;	/* end of data objects */
+	int			error;		/* error return value */
+	int			i;		/* index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to scan block freespc */
+	int			newoffset;	/* offset from current entry */
+	int			offset;		/* target block offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* sf entry pointer */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	xfs_dir2_data_off_t	*tagp;		/* end of data entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("sf_to_block", args);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bomb out if the shortform directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+	/*
+	 * Copy the directory into the stack buffer.
+	 * Then pitch the incore inode data so we can make extents.
+	 */
+
+	buf_len = dp->i_df.if_bytes;
+	buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP);
+
+	bcopy(sfp, buf, dp->i_df.if_bytes);
+	xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK);
+	dp->i_d.di_size = 0;
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	/*
+	 * Reset pointer - old sfp is gone.
+	 */
+	sfp = (xfs_dir2_sf_t *)buf;
+	/*
+	 * Add block 0 to the inode.
+	 */
+	error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
+	if (error) {
+		kmem_free(buf, buf_len);
+		return error;
+	}
+	/*
+	 * Initialize the data block.
+	 */
+	error = xfs_dir2_data_init(args, blkno, &bp);
+	if (error) {
+		kmem_free(buf, buf_len);
+		return error;
+	}
+	block = bp->data;
+	INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_BLOCK_MAGIC);
+	/*
+	 * Compute size of block "tail" area.
+	 */
+	i = (uint)sizeof(*btp) +
+	    (INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t);
+	/*
+	 * The whole thing is initialized to free by the init routine.
+	 * Say we're using the leaf and tail area.
+	 */
+	dup = (xfs_dir2_data_unused_t *)block->u;
+	needlog = needscan = 0;
+	xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
+		&needscan);
+	ASSERT(needscan == 0);
+	/*
+	 * Fill in the tail.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	INT_SET(btp->count, ARCH_CONVERT, INT_GET(sfp->hdr.count, ARCH_CONVERT) + 2);	/* ., .. */
+	INT_ZERO(btp->stale, ARCH_CONVERT);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	endoffset = (uint)((char *)blp - (char *)block);
+	/*
+	 * Remove the freespace, we'll manage it.
+	 */
+	xfs_dir2_data_use_free(tp, bp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)block),
+		INT_GET(dup->length, ARCH_CONVERT), &needlog, &needscan);
+	/*
+	 * Create entry for .
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)block + XFS_DIR2_DATA_DOT_OFFSET);
+	INT_SET(dep->inumber, ARCH_CONVERT, dp->i_ino);
+	dep->namelen = 1;
+	dep->name[0] = '.';
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+	xfs_dir2_data_log_entry(tp, bp, dep);
+	INT_SET(blp[0].hashval, ARCH_CONVERT, xfs_dir_hash_dot);
+	INT_SET(blp[0].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
+	/*
+	 * Create entry for ..
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+		((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET);
+	INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT));
+	dep->namelen = 2;
+	dep->name[0] = dep->name[1] = '.';
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+	xfs_dir2_data_log_entry(tp, bp, dep);
+	INT_SET(blp[1].hashval, ARCH_CONVERT, xfs_dir_hash_dotdot);
+	INT_SET(blp[1].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp, (char *)dep - (char *)block));
+	offset = XFS_DIR2_DATA_FIRST_OFFSET;
+	/*
+	 * Loop over existing entries, stuff them in.
+	 */
+	if ((i = 0) == INT_GET(sfp->hdr.count, ARCH_CONVERT))
+		sfep = NULL;
+	else
+		sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	/*
+	 * Need to preserve the existing offset values in the sf directory.
+	 * Insert holes (unused entries) where necessary.
+	 */
+	while (offset < endoffset) {
+		/*
+		 * sfep is null when we reach the end of the list.
+		 */
+		if (sfep == NULL)
+			newoffset = endoffset;
+		else
+			newoffset = XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT);
+		/*
+		 * There should be a hole here, make one.
+		 */
+		if (offset < newoffset) {
+			dup = (xfs_dir2_data_unused_t *)
+			      ((char *)block + offset);
+			INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+			INT_SET(dup->length, ARCH_CONVERT, newoffset - offset);
+			INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT,
+				(xfs_dir2_data_off_t)
+				((char *)dup - (char *)block));
+			xfs_dir2_data_log_unused(tp, bp, dup);
+			(void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block,
+				dup, &dummy);
+			offset += INT_GET(dup->length, ARCH_CONVERT);
+			continue;
+		}
+		/*
+		 * Copy a real entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset);
+		INT_SET(dep->inumber, ARCH_CONVERT, XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+				XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT));
+		dep->namelen = sfep->namelen;
+		bcopy(sfep->name, dep->name, dep->namelen);
+		tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+		INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)block));
+		xfs_dir2_data_log_entry(tp, bp, dep);
+		INT_SET(blp[2 + i].hashval, ARCH_CONVERT, xfs_da_hashname((char *)sfep->name, sfep->namelen));
+		INT_SET(blp[2 + i].address, ARCH_CONVERT, XFS_DIR2_BYTE_TO_DATAPTR(mp,
+						 (char *)dep - (char *)block));
+		offset = (int)((char *)(tagp + 1) - (char *)block);
+		if (++i == INT_GET(sfp->hdr.count, ARCH_CONVERT))
+			sfep = NULL;
+		else
+			sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+	}
+	/* Done with the temporary buffer */
+	kmem_free(buf, buf_len);
+	/*
+	 * Sort the leaf entries by hash value.
+	 */
+	qsort(blp, INT_GET(btp->count, ARCH_CONVERT), sizeof(*blp), xfs_dir2_block_sort);
+	/*
+	 * Log the leaf entry area and tail.
+	 * Already logged the header in data_init, ignore needlog.
+	 */
+	ASSERT(needscan == 0);
+	xfs_dir2_block_log_leaf(tp, bp, 0, INT_GET(btp->count, ARCH_CONVERT) - 1);
+	xfs_dir2_block_log_tail(tp, bp);
+	xfs_dir2_data_check(dp, bp);
+	xfs_da_buf_done(bp);
+	return 0;
+}
diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h
new file mode 100644
index 000000000000..16fdf22bf17d
--- /dev/null
+++ b/fs/xfs/xfs_dir2_block.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_BLOCK_H__
+#define __XFS_DIR2_BLOCK_H__
+
+/*
+ * xfs_dir2_block.h
+ * Directory version 2, single block format structures
+ */
+
+struct dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_dir2_data_hdr;
+struct xfs_dir2_leaf_entry;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * The single block format is as follows:
+ * xfs_dir2_data_hdr_t structure
+ * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures
+ * xfs_dir2_leaf_entry_t structures
+ * xfs_dir2_block_tail_t structure
+ */
+
+#define XFS_DIR2_BLOCK_MAGIC	0x58443242	/* XD2B: for one block dirs */
+
+typedef struct xfs_dir2_block_tail {
+	__uint32_t	count;			/* count of leaf entries */
+	__uint32_t	stale;			/* count of stale lf entries */
+} xfs_dir2_block_tail_t;
+
+/*
+ * Generic single-block structure, for xfs_db.
+ */
+typedef struct xfs_dir2_block {
+	xfs_dir2_data_hdr_t	hdr;		/* magic XFS_DIR2_BLOCK_MAGIC */
+	xfs_dir2_data_union_t	u[1];
+	xfs_dir2_leaf_entry_t	leaf[1];
+	xfs_dir2_block_tail_t	tail;
+} xfs_dir2_block_t;
+
+/*
+ * Pointer to the leaf header embedded in a data block (1-block format)
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_TAIL_P)
+xfs_dir2_block_tail_t *
+xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block);
+#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block)
+#else
+#define XFS_DIR2_BLOCK_TAIL_P(mp,block) \
+	(((xfs_dir2_block_tail_t *)((char *)(block) + (mp)->m_dirblksize)) - 1)
+#endif
+
+/*
+ * Pointer to the leaf entries embedded in a data block (1-block format)
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BLOCK_LEAF_P)
+struct xfs_dir2_leaf_entry *xfs_dir2_block_leaf_p_arch(
+	xfs_dir2_block_tail_t *btp, xfs_arch_t arch);
+#define XFS_DIR2_BLOCK_LEAF_P_ARCH(btp,arch) \
+	xfs_dir2_block_leaf_p_arch(btp,arch)
+#else
+#define XFS_DIR2_BLOCK_LEAF_P_ARCH(btp,arch)	\
+	(((struct xfs_dir2_leaf_entry *)(btp)) - INT_GET((btp)->count, arch))
+#endif
+
+/*
+ * Function declarations.
+ */
+
+extern int
+	xfs_dir2_block_addname(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_block_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
+				struct uio *uio, int *eofp, struct xfs_dirent *dbp,
+				xfs_dir2_put_t put);
+
+extern int
+	xfs_dir2_block_lookup(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_block_removename(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_block_replace(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_leaf_to_block(struct xfs_da_args *args, struct xfs_dabuf *lbp,
+			       struct xfs_dabuf *dbp);
+
+extern int
+	xfs_dir2_sf_to_block(struct xfs_da_args *args);
+
+#endif	/* __XFS_DIR2_BLOCK_H__ */
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
new file mode 100644
index 000000000000..7ea956729cf1
--- /dev/null
+++ b/fs/xfs/xfs_dir2_data.c
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_data.c
+ * Core data block handling routines for XFS V2 directories.
+ * See xfs_dir2_data.h for data structures.
+ */
+
+#include <xfs.h>
+
+
+#ifdef DEBUG
+/*
+ * Check the consistency of the data block.
+ * The input can also be a block-format directory.
+ * Pop an assert if we find anything bad.
+ */
+void
+xfs_dir2_data_check(
+	xfs_inode_t		*dp,		/* incore inode pointer */
+	xfs_dabuf_t		*bp)		/* data block's buffer */
+{
+	xfs_dir2_dataptr_t	addr;		/* addr for leaf lookup */
+	xfs_dir2_data_free_t	*bf;		/* bestfree table */
+	xfs_dir2_block_tail_t	*btp=NULL;	/* block tail */
+	int			count;		/* count of entries found */
+	xfs_dir2_data_t		*d;		/* data block pointer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry */
+	char			*endp;		/* end of useful data */
+	int			freeseen;	/* mask of bestfrees seen */
+	xfs_dahash_t		hash;		/* hash of current name */
+	int			i;		/* leaf index */
+	int			lastfree;	/* last entry was unused */
+	xfs_dir2_leaf_entry_t	*lep=NULL;	/* block leaf entries */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	char			*p;		/* current data position */
+	int			stale;		/* count of stale leaves */
+
+	mp = dp->i_mount;
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	bf = d->hdr.bestfree;
+	p = (char *)d->u;
+	if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
+		btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
+		lep = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+		endp = (char *)lep;
+	} else
+		endp = (char *)d + mp->m_dirblksize;
+	count = lastfree = freeseen = 0;
+	/*
+	 * Account for zero bestfree entries.
+	 */
+	if (INT_ISZERO(bf[0].length, ARCH_CONVERT)) {
+		ASSERT(INT_ISZERO(bf[0].offset, ARCH_CONVERT));
+		freeseen |= 1 << 0;
+	}
+	if (INT_ISZERO(bf[1].length, ARCH_CONVERT)) {
+		ASSERT(INT_ISZERO(bf[1].offset, ARCH_CONVERT));
+		freeseen |= 1 << 1;
+	}
+	if (INT_ISZERO(bf[2].length, ARCH_CONVERT)) {
+		ASSERT(INT_ISZERO(bf[2].offset, ARCH_CONVERT));
+		freeseen |= 1 << 2;
+	}
+	ASSERT(INT_GET(bf[0].length, ARCH_CONVERT) >= INT_GET(bf[1].length, ARCH_CONVERT));
+	ASSERT(INT_GET(bf[1].length, ARCH_CONVERT) >= INT_GET(bf[2].length, ARCH_CONVERT));
+	/*
+	 * Loop over the data/unused entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's unused, look for the space in the bestfree table.
+		 * If we find it, account for that, else make sure it
+		 * doesn't need to be there.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			ASSERT(lastfree == 0);
+			ASSERT(INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT) ==
+			       (char *)dup - (char *)d);
+			dfp = xfs_dir2_data_freefind(d, dup);
+			if (dfp) {
+				i = (int)(dfp - bf);
+				ASSERT((freeseen & (1 << i)) == 0);
+				freeseen |= 1 << i;
+			} else
+				ASSERT(INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(bf[2].length, ARCH_CONVERT));
+			p += INT_GET(dup->length, ARCH_CONVERT);
+			lastfree = 1;
+			continue;
+		}
+		/*
+		 * It's a real entry.  Validate the fields.
+		 * If this is a block directory then make sure it's
+		 * in the leaf section of the block.
+		 * The linear search is crude but this is DEBUG code.
+		 */
+		dep = (xfs_dir2_data_entry_t *)p;
+		ASSERT(dep->namelen != 0);
+		ASSERT(xfs_dir_ino_validate(mp, INT_GET(dep->inumber, ARCH_CONVERT)) == 0);
+		ASSERT(INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT) ==
+		       (char *)dep - (char *)d);
+		count++;
+		lastfree = 0;
+		if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
+			addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+				(xfs_dir2_data_aoff_t)
+				((char *)dep - (char *)d));
+			hash = xfs_da_hashname((char *)dep->name, dep->namelen);
+			for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
+				if (INT_GET(lep[i].address, ARCH_CONVERT) == addr &&
+				    INT_GET(lep[i].hashval, ARCH_CONVERT) == hash)
+					break;
+			}
+			ASSERT(i < INT_GET(btp->count, ARCH_CONVERT));
+		}
+		p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+	}
+	/*
+	 * Need to have seen all the entries and all the bestfree slots.
+	 */
+	ASSERT(freeseen == 7);
+	if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
+		for (i = stale = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
+			if (INT_GET(lep[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+				stale++;
+			if (i > 0)
+				ASSERT(INT_GET(lep[i].hashval, ARCH_CONVERT) >= INT_GET(lep[i - 1].hashval, ARCH_CONVERT));
+		}
+		ASSERT(count == INT_GET(btp->count, ARCH_CONVERT) - INT_GET(btp->stale, ARCH_CONVERT));
+		ASSERT(stale == INT_GET(btp->stale, ARCH_CONVERT));
+	}
+}
+#endif
+
+/*
+ * Given a data block and an unused entry from that block,
+ * return the bestfree entry if any that corresponds to it.
+ */
+xfs_dir2_data_free_t *
+xfs_dir2_data_freefind(
+	xfs_dir2_data_t		*d,		/* data block */
+	xfs_dir2_data_unused_t	*dup)		/* data unused entry */
+{
+	xfs_dir2_data_free_t	*dfp;		/* bestfree entry */
+	xfs_dir2_data_aoff_t	off;		/* offset value needed */
+#if defined(DEBUG) && defined(__KERNEL__)
+	int			matched;	/* matched the value */
+	int			seenzero;	/* saw a 0 bestfree entry */
+#endif
+
+	off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d);
+#if defined(DEBUG) && defined(__KERNEL__)
+	/*
+	 * Validate some consistency in the bestfree table.
+	 * Check order, non-overlapping entries, and if we find the
+	 * one we're looking for it has to be exact.
+	 */
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0;
+	     dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
+	     dfp++) {
+		if (INT_ISZERO(dfp->offset, ARCH_CONVERT)) {
+			ASSERT(INT_ISZERO(dfp->length, ARCH_CONVERT));
+			seenzero = 1;
+			continue;
+		}
+		ASSERT(seenzero == 0);
+		if (INT_GET(dfp->offset, ARCH_CONVERT) == off) {
+			matched = 1;
+			ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(dup->length, ARCH_CONVERT));
+		} else if (off < INT_GET(dfp->offset, ARCH_CONVERT))
+			ASSERT(off + INT_GET(dup->length, ARCH_CONVERT) <= INT_GET(dfp->offset, ARCH_CONVERT));
+		else
+			ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) + INT_GET(dfp->length, ARCH_CONVERT) <= off);
+		ASSERT(matched || INT_GET(dfp->length, ARCH_CONVERT) >= INT_GET(dup->length, ARCH_CONVERT));
+		if (dfp > &d->hdr.bestfree[0])
+			ASSERT(INT_GET(dfp[-1].length, ARCH_CONVERT) >= INT_GET(dfp[0].length, ARCH_CONVERT));
+	}
+#endif
+	/*
+	 * If this is smaller than the smallest bestfree entry,
+	 * it can't be there since they're sorted.
+	 */
+	if (INT_GET(dup->length, ARCH_CONVERT) < INT_GET(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length, ARCH_CONVERT))
+		return NULL;
+	/*
+	 * Look at the three bestfree entries for our guy.
+	 */
+	for (dfp = &d->hdr.bestfree[0];
+	     dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT];
+	     dfp++) {
+		if (INT_ISZERO(dfp->offset, ARCH_CONVERT))
+			return NULL;
+		if (INT_GET(dfp->offset, ARCH_CONVERT) == off)
+			return dfp;
+	}
+	/*
+	 * Didn't find it.  This only happens if there are duplicate lengths.
+	 */
+	return NULL;
+}
+
+/*
+ * Insert an unused-space entry into the bestfree table.
+ */
+xfs_dir2_data_free_t *				/* entry inserted */
+xfs_dir2_data_freeinsert(
+	xfs_dir2_data_t		*d,		/* data block pointer */
+	xfs_dir2_data_unused_t	*dup,		/* unused space */
+	int			*loghead)	/* log the data header (out) */
+{
+	xfs_dir2_data_free_t	*dfp;		/* bestfree table pointer */
+	xfs_dir2_data_free_t	new;		/* new bestfree entry */
+
+#ifdef __KERNEL__
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+#endif
+	dfp = d->hdr.bestfree;
+	INT_COPY(new.length, dup->length, ARCH_CONVERT);
+	INT_SET(new.offset, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dup - (char *)d));
+	/*
+	 * Insert at position 0, 1, or 2; or not at all.
+	 */
+	if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[0].length, ARCH_CONVERT)) {
+		dfp[2] = dfp[1];
+		dfp[1] = dfp[0];
+		dfp[0] = new;
+		*loghead = 1;
+		return &dfp[0];
+	}
+	if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[1].length, ARCH_CONVERT)) {
+		dfp[2] = dfp[1];
+		dfp[1] = new;
+		*loghead = 1;
+		return &dfp[1];
+	}
+	if (INT_GET(new.length, ARCH_CONVERT) > INT_GET(dfp[2].length, ARCH_CONVERT)) {
+		dfp[2] = new;
+		*loghead = 1;
+		return &dfp[2];
+	}
+	return NULL;
+}
+
+/*
+ * Remove a bestfree entry from the table.
+ */
+void
+xfs_dir2_data_freeremove(
+	xfs_dir2_data_t		*d,		/* data block pointer */
+	xfs_dir2_data_free_t	*dfp,		/* bestfree entry pointer */
+	int			*loghead)	/* out: log data header */
+{
+#ifdef __KERNEL__
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+#endif
+	/*
+	 * It's the first entry, slide the next 2 up.
+	 */
+	if (dfp == &d->hdr.bestfree[0]) {
+		d->hdr.bestfree[0] = d->hdr.bestfree[1];
+		d->hdr.bestfree[1] = d->hdr.bestfree[2];
+	}
+	/*
+	 * It's the second entry, slide the 3rd entry up.
+	 */
+	else if (dfp == &d->hdr.bestfree[1])
+		d->hdr.bestfree[1] = d->hdr.bestfree[2];
+	/*
+	 * Must be the last entry.
+	 */
+	else
+		ASSERT(dfp == &d->hdr.bestfree[2]);
+	/*
+	 * Clear the 3rd entry, must be zero now.
+	 */
+	INT_ZERO(d->hdr.bestfree[2].length, ARCH_CONVERT);
+	INT_ZERO(d->hdr.bestfree[2].offset, ARCH_CONVERT);
+	*loghead = 1;
+}
+
+/*
+ * Given a data block, reconstruct its bestfree map.
+ */
+void
+xfs_dir2_data_freescan(
+	xfs_mount_t		*mp,		/* filesystem mount point */
+	xfs_dir2_data_t		*d,		/* data block pointer */
+	int			*loghead,	/* out: log data header */
+	char			*aendp)		/* in: caller's endp */
+{
+	xfs_dir2_block_tail_t	*btp;		/* block tail */
+	xfs_dir2_data_entry_t	*dep;		/* active data entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused data entry */
+	char			*endp;		/* end of block's data */
+	char			*p;		/* current entry pointer */
+
+#ifdef __KERNEL__
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+#endif
+	/*
+	 * Start by clearing the table.
+	 */
+	bzero(d->hdr.bestfree, sizeof(d->hdr.bestfree));
+	*loghead = 1;
+	/*
+	 * Set up pointers.
+	 */
+	p = (char *)d->u;
+	if (aendp)
+		endp = aendp;
+	else if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC) {
+		btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
+		endp = (char *)XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	} else
+		endp = (char *)d + mp->m_dirblksize;
+	/*
+	 * Loop over the block's entries.
+	 */
+	while (p < endp) {
+		dup = (xfs_dir2_data_unused_t *)p;
+		/*
+		 * If it's a free entry, insert it.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			ASSERT((char *)dup - (char *)d ==
+			       INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT));
+			xfs_dir2_data_freeinsert(d, dup, loghead);
+			p += INT_GET(dup->length, ARCH_CONVERT);
+		}
+		/*
+		 * For active entries, check their tags and skip them.
+		 */
+		else {
+			dep = (xfs_dir2_data_entry_t *)p;
+			ASSERT((char *)dep - (char *)d ==
+			       INT_GET(*XFS_DIR2_DATA_ENTRY_TAG_P(dep), ARCH_CONVERT));
+			p += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+		}
+	}
+}
+
+/*
+ * Initialize a data block at the given block number in the directory.
+ * Give back the buffer for the created block.
+ */
+int						/* error */
+xfs_dir2_data_init(
+	xfs_da_args_t		*args,		/* directory operation args */
+	xfs_dir2_db_t		blkno,		/* logical dir block number */
+	xfs_dabuf_t		**bpp)		/* output block buffer */
+{
+	xfs_dabuf_t		*bp;		/* block buffer */
+	xfs_dir2_data_t		*d;		/* pointer to block */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry pointer */
+	int			error;		/* error return value */
+	int			i;		/* bestfree index */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	int			t;		/* temp */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Get the buffer set up for the block.
+	 */
+	error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp,
+		XFS_DATA_FORK);
+	if (error) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	/*
+	 * Initialize the header.
+	 */
+	d = bp->data;
+	INT_SET(d->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
+	INT_SET(d->hdr.bestfree[0].offset, ARCH_CONVERT, (xfs_dir2_data_off_t)sizeof(d->hdr));
+	for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
+		INT_ZERO(d->hdr.bestfree[i].length, ARCH_CONVERT);
+		INT_ZERO(d->hdr.bestfree[i].offset, ARCH_CONVERT);
+	}
+	/*
+	 * Set up an unused entry for the block's body.
+	 */
+	dup = &d->u[0].unused;
+	INT_SET(dup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+
+	t=mp->m_dirblksize - (uint)sizeof(d->hdr);
+	INT_SET(d->hdr.bestfree[0].length, ARCH_CONVERT, t);
+	INT_SET(dup->length, ARCH_CONVERT, t);
+	INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT,
+		(xfs_dir2_data_off_t)((char *)dup - (char *)d));
+	/*
+	 * Log it and return it.
+	 */
+	xfs_dir2_data_log_header(tp, bp);
+	xfs_dir2_data_log_unused(tp, bp, dup);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Log an active data entry from the block.
+ */
+void
+xfs_dir2_data_log_entry(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	xfs_dir2_data_entry_t	*dep)		/* data entry pointer */
+{
+	xfs_dir2_data_t		*d;		/* data block pointer */
+
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d),
+		(uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) -
+		       (char *)d - 1));
+}
+
+/*
+ * Log a data block header.
+ */
+void
+xfs_dir2_data_log_header(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp)		/* block buffer */
+{
+	xfs_dir2_data_t		*d;		/* data block pointer */
+
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d),
+		(uint)(sizeof(d->hdr) - 1));
+}
+
+/*
+ * Log a data unused entry.
+ */
+void
+xfs_dir2_data_log_unused(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	xfs_dir2_data_unused_t	*dup)		/* data unused pointer */
+{
+	xfs_dir2_data_t		*d;		/* data block pointer */
+
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	/*
+	 * Log the first part of the unused entry.
+	 */
+	xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d),
+		(uint)((char *)&dup->length + sizeof(dup->length) -
+		       1 - (char *)d));
+	/*
+	 * Log the end (tag) of the unused entry.
+	 */
+	xfs_da_log_buf(tp, bp,
+		(uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT) - (char *)d),
+		(uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT) - (char *)d +
+		       sizeof(xfs_dir2_data_off_t) - 1));
+}
+
+/*
+ * Make a byte range in the data block unused.
+ * Its current contents are unimportant.
+ */
+void
+xfs_dir2_data_make_free(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	xfs_dir2_data_aoff_t	offset,		/* starting byte offset */
+	xfs_dir2_data_aoff_t	len,		/* length in bytes */
+	int			*needlogp,	/* out: log header */
+	int			*needscanp)	/* out: regen bestfree */
+{
+	xfs_dir2_data_t		*d;		/* data block pointer */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	char			*endptr;	/* end of data area */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*postdup;	/* unused entry after us */
+	xfs_dir2_data_unused_t	*prevdup;	/* unused entry before us */
+
+	mp = tp->t_mountp;
+	d = bp->data;
+	/*
+	 * Figure out where the end of the data area is.
+	 */
+	if (INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC)
+		endptr = (char *)d + mp->m_dirblksize;
+	else {
+		xfs_dir2_block_tail_t	*btp;	/* block tail */
+
+		ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+		btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d);
+		endptr = (char *)XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	}
+	/*
+	 * If this isn't the start of the block, then back up to
+	 * the previous entry and see if it's free.
+	 */
+	if (offset > sizeof(d->hdr)) {
+		xfs_dir2_data_off_t	*tagp;	/* tag just before us */
+
+		tagp = (xfs_dir2_data_off_t *)((char *)d + offset) - 1;
+		prevdup = (xfs_dir2_data_unused_t *)((char *)d + INT_GET(*tagp, ARCH_CONVERT));
+		if (INT_GET(prevdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
+			prevdup = NULL;
+	} else
+		prevdup = NULL;
+	/*
+	 * If this isn't the end of the block, see if the entry after
+	 * us is free.
+	 */
+	if ((char *)d + offset + len < endptr) {
+		postdup =
+			(xfs_dir2_data_unused_t *)((char *)d + offset + len);
+		if (INT_GET(postdup->freetag, ARCH_CONVERT) != XFS_DIR2_DATA_FREE_TAG)
+			postdup = NULL;
+	} else
+		postdup = NULL;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * Previous and following entries are both free,
+	 * merge everything into a single free entry.
+	 */
+	if (prevdup && postdup) {
+		xfs_dir2_data_free_t	*dfp2;	/* another bestfree pointer */
+
+		/*
+		 * See if prevdup and/or postdup are in bestfree table.
+		 */
+		dfp = xfs_dir2_data_freefind(d, prevdup);
+		dfp2 = xfs_dir2_data_freefind(d, postdup);
+		/*
+		 * We need a rescan unless there are exactly 2 free entries
+		 * namely our two.  Then we know what's happening, otherwise
+		 * since the third bestfree is there, there might be more
+		 * entries.
+		 */
+		needscan = !INT_ISZERO(d->hdr.bestfree[2].length, ARCH_CONVERT);
+		/*
+		 * Fix up the new big freespace.
+		 */
+		INT_MOD(prevdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(prevdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, prevdup);
+		if (!needscan) {
+			/*
+			 * Has to be the case that entries 0 and 1 are
+			 * dfp and dfp2 (don't know which is which), and
+			 * entry 2 is empty.
+			 * Remove entry 1 first then entry 0.
+			 */
+			ASSERT(dfp && dfp2);
+			if (dfp == &d->hdr.bestfree[1]) {
+				dfp = &d->hdr.bestfree[0];
+				ASSERT(dfp2 == dfp);
+				dfp2 = &d->hdr.bestfree[1];
+			}
+			xfs_dir2_data_freeremove(d, dfp2, needlogp);
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			/*
+			 * Now insert the new entry.
+			 */
+			dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp);
+			ASSERT(dfp == &d->hdr.bestfree[0]);
+			ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(prevdup->length, ARCH_CONVERT));
+			ASSERT(INT_ISZERO(dfp[1].length, ARCH_CONVERT));
+			ASSERT(INT_ISZERO(dfp[2].length, ARCH_CONVERT));
+		}
+	}
+	/*
+	 * The entry before us is free, merge with it.
+	 */
+	else if (prevdup) {
+		dfp = xfs_dir2_data_freefind(d, prevdup);
+		INT_MOD(prevdup->length, ARCH_CONVERT, len);
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(prevdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)prevdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, prevdup);
+		/*
+		 * If the previous entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			(void)xfs_dir2_data_freeinsert(d, prevdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else
+			needscan = INT_GET(prevdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
+	}
+	/*
+	 * The following entry is free, merge with it.
+	 */
+	else if (postdup) {
+		dfp = xfs_dir2_data_freefind(d, postdup);
+		newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
+		INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+		INT_SET(newdup->length, ARCH_CONVERT, len + INT_GET(postdup->length, ARCH_CONVERT));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		/*
+		 * If the following entry was in the table, the new entry
+		 * is longer, so it will be in the table too.  Remove
+		 * the old one and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			(void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
+		}
+		/*
+		 * Otherwise we need a scan if the new entry is big enough.
+		 */
+		else
+			needscan = INT_GET(newdup->length, ARCH_CONVERT) > INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT);
+	}
+	/*
+	 * Neither neighbor is free.  Make a new entry.
+	 */
+	else {
+		newdup = (xfs_dir2_data_unused_t *)((char *)d + offset);
+		INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+		INT_SET(newdup->length, ARCH_CONVERT, len);
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		(void)xfs_dir2_data_freeinsert(d, newdup, needlogp);
+	}
+	*needscanp = needscan;
+}
+
+/*
+ * Take a byte range out of an existing unused space and make it un-free.
+ */
+void
+xfs_dir2_data_use_free(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* data block buffer */
+	xfs_dir2_data_unused_t	*dup,		/* unused entry */
+	xfs_dir2_data_aoff_t	offset,		/* starting offset to use */
+	xfs_dir2_data_aoff_t	len,		/* length to use */
+	int			*needlogp,	/* out: need to log header */
+	int			*needscanp)	/* out: need regen bestfree */
+{
+	xfs_dir2_data_t		*d;		/* data block */
+	xfs_dir2_data_free_t	*dfp;		/* bestfree pointer */
+	int			matchback;	/* matches end of freespace */
+	int			matchfront;	/* matches start of freespace */
+	int			needscan;	/* need to regen bestfree */
+	xfs_dir2_data_unused_t	*newdup;	/* new unused entry */
+	xfs_dir2_data_unused_t	*newdup2;	/* another new unused entry */
+	int			oldlen;		/* old unused entry's length */
+
+	d = bp->data;
+	ASSERT(INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC ||
+	       INT_GET(d->hdr.magic, ARCH_CONVERT) == XFS_DIR2_BLOCK_MAGIC);
+	ASSERT(INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG);
+	ASSERT(offset >= (char *)dup - (char *)d);
+	ASSERT(offset + len <= (char *)dup + INT_GET(dup->length, ARCH_CONVERT) - (char *)d);
+	ASSERT((char *)dup - (char *)d == INT_GET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup, ARCH_CONVERT), ARCH_CONVERT));
+	/*
+	 * Look up the entry in the bestfree table.
+	 */
+	dfp = xfs_dir2_data_freefind(d, dup);
+	oldlen = INT_GET(dup->length, ARCH_CONVERT);
+	ASSERT(dfp || oldlen <= INT_GET(d->hdr.bestfree[2].length, ARCH_CONVERT));
+	/*
+	 * Check for alignment with front and back of the entry.
+	 */
+	matchfront = (char *)dup - (char *)d == offset;
+	matchback = (char *)dup + oldlen - (char *)d == offset + len;
+	ASSERT(*needscanp == 0);
+	needscan = 0;
+	/*
+	 * If we matched it exactly we just need to get rid of it from
+	 * the bestfree table.
+	 */
+	if (matchfront && matchback) {
+		if (dfp) {
+			needscan = !INT_ISZERO(d->hdr.bestfree[2].offset, ARCH_CONVERT);
+			if (!needscan)
+				xfs_dir2_data_freeremove(d, dfp, needlogp);
+		}
+	}
+	/*
+	 * We match the first part of the entry.
+	 * Make a new entry with the remaining freespace.
+	 */
+	else if (matchfront) {
+		newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
+		INT_SET(newdup->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+		INT_SET(newdup->length, ARCH_CONVERT, oldlen - len);
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
+			ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
+			/*
+			 * If we got inserted at the last slot,
+			 * that means we don't know if there was a better
+			 * choice for the last slot, or not.  Rescan.
+			 */
+			needscan = dfp == &d->hdr.bestfree[2];
+		}
+	}
+	/*
+	 * We match the last part of the entry.
+	 * Trim the allocated space off the tail of the entry.
+	 */
+	else if (matchback) {
+		newdup = dup;
+		INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
+			(((char *)d + offset) - (char *)newdup));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		/*
+		 * If it was in the table, remove it and add the new one.
+		 */
+		if (dfp) {
+			xfs_dir2_data_freeremove(d, dfp, needlogp);
+			dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp);
+			ASSERT(dfp != NULL);
+			ASSERT(INT_GET(dfp->length, ARCH_CONVERT) == INT_GET(newdup->length, ARCH_CONVERT));
+			ASSERT(INT_GET(dfp->offset, ARCH_CONVERT) == (char *)newdup - (char *)d);
+			/*
+			 * If we got inserted at the last slot,
+			 * that means we don't know if there was a better
+			 * choice for the last slot, or not.  Rescan.
+			 */
+			needscan = dfp == &d->hdr.bestfree[2];
+		}
+	}
+	/*
+	 * Poking out the middle of an entry.
+	 * Make two new entries.
+	 */
+	else {
+		newdup = dup;
+		INT_SET(newdup->length, ARCH_CONVERT, (xfs_dir2_data_off_t)
+			(((char *)d + offset) - (char *)newdup));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup);
+		newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len);
+		INT_SET(newdup2->freetag, ARCH_CONVERT, XFS_DIR2_DATA_FREE_TAG);
+		INT_SET(newdup2->length, ARCH_CONVERT, oldlen - len - INT_GET(newdup->length, ARCH_CONVERT));
+		INT_SET(*XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(newdup2, ARCH_CONVERT), ARCH_CONVERT,
+			(xfs_dir2_data_off_t)((char *)newdup2 - (char *)d));
+		xfs_dir2_data_log_unused(tp, bp, newdup2);
+		/*
+		 * If the old entry was in the table, we need to scan
+		 * if the 3rd entry was valid, since these entries
+		 * are smaller than the old one.
+		 * If we don't need to scan that means there were 1 or 2
+		 * entries in the table, and removing the old and adding
+		 * the 2 new will work.
+		 */
+		if (dfp) {
+			needscan = !INT_ISZERO(d->hdr.bestfree[2].length, ARCH_CONVERT);
+			if (!needscan) {
+				xfs_dir2_data_freeremove(d, dfp, needlogp);
+				(void)xfs_dir2_data_freeinsert(d, newdup,
+					needlogp);
+				(void)xfs_dir2_data_freeinsert(d, newdup2,
+					needlogp);
+			}
+		}
+	}
+	*needscanp = needscan;
+}
diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h
new file mode 100644
index 000000000000..67236b27197b
--- /dev/null
+++ b/fs/xfs/xfs_dir2_data.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_DATA_H__
+#define __XFS_DIR2_DATA_H__
+
+/*
+ * Directory format 2, data block structures.
+ */
+
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Constants.
+ */
+#define XFS_DIR2_DATA_MAGIC	0x58443244	/* XD2D: for multiblock dirs */
+#define XFS_DIR2_DATA_ALIGN_LOG 3		/* i.e., 8 bytes */
+#define XFS_DIR2_DATA_ALIGN	(1 << XFS_DIR2_DATA_ALIGN_LOG)
+#define XFS_DIR2_DATA_FREE_TAG	0xffff
+#define XFS_DIR2_DATA_FD_COUNT	3
+
+/*
+ * Directory address space divided into sections,
+ * spaces separated by 32gb.
+ */
+#define XFS_DIR2_SPACE_SIZE	(1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
+#define XFS_DIR2_DATA_SPACE	0
+#define XFS_DIR2_DATA_OFFSET	(XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
+#define XFS_DIR2_DATA_FIRSTDB(mp)	\
+	XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET)
+
+/*
+ * Offsets of . and .. in data space (always block 0)
+ */
+#define XFS_DIR2_DATA_DOT_OFFSET	\
+	((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t))
+#define XFS_DIR2_DATA_DOTDOT_OFFSET	\
+	(XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1))
+#define XFS_DIR2_DATA_FIRST_OFFSET		\
+	(XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2))
+
+/*
+ * Structures.
+ */
+
+/*
+ * Describe a free area in the data block.
+ * The freespace will be formatted as a xfs_dir2_data_unused_t.
+ */
+typedef struct xfs_dir2_data_free {
+	xfs_dir2_data_off_t	offset;		/* start of freespace */
+	xfs_dir2_data_off_t	length;		/* length of freespace */
+} xfs_dir2_data_free_t;
+
+/*
+ * Header for the data blocks.
+ * Always at the beginning of a directory-sized block.
+ * The code knows that XFS_DIR2_DATA_FD_COUNT is 3.
+ */
+typedef struct xfs_dir2_data_hdr {
+	__uint32_t		magic;		/* XFS_DIR2_DATA_MAGIC */
+						/* or XFS_DIR2_BLOCK_MAGIC */
+	xfs_dir2_data_free_t	bestfree[XFS_DIR2_DATA_FD_COUNT];
+} xfs_dir2_data_hdr_t;
+
+/*
+ * Active entry in a data block.  Aligned to 8 bytes.
+ * Tag appears as the last 2 bytes.
+ */
+typedef struct xfs_dir2_data_entry {
+	xfs_ino_t		inumber;	/* inode number */
+	__uint8_t		namelen;	/* name length */
+	__uint8_t		name[1];	/* name bytes, no null */
+						/* variable offset */
+	xfs_dir2_data_off_t	tag;		/* starting offset of us */
+} xfs_dir2_data_entry_t;
+
+/*
+ * Unused entry in a data block.  Aligned to 8 bytes.
+ * Tag appears as the last 2 bytes.
+ */
+typedef struct xfs_dir2_data_unused {
+	__uint16_t		freetag;	/* XFS_DIR2_DATA_FREE_TAG */
+	xfs_dir2_data_off_t	length;		/* total free length */
+						/* variable offset */
+	xfs_dir2_data_off_t	tag;		/* starting offset of us */
+} xfs_dir2_data_unused_t;
+
+typedef union {
+	xfs_dir2_data_entry_t	entry;
+	xfs_dir2_data_unused_t	unused;
+} xfs_dir2_data_union_t;
+
+/*
+ * Generic data block structure, for xfs_db.
+ */
+typedef struct xfs_dir2_data {
+	xfs_dir2_data_hdr_t	hdr;		/* magic XFS_DIR2_DATA_MAGIC */
+	xfs_dir2_data_union_t	u[1];
+} xfs_dir2_data_t;
+
+/*
+ * Macros.
+ */
+
+/*
+ * Size of a data entry.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTSIZE)
+int xfs_dir2_data_entsize(int n);
+#define XFS_DIR2_DATA_ENTSIZE(n)	xfs_dir2_data_entsize(n)
+#else
+#define XFS_DIR2_DATA_ENTSIZE(n)	\
+	((int)(roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \
+		 (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN)))
+#endif
+
+/*
+ * Pointer to an entry's tag word.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_ENTRY_TAG_P)
+xfs_dir2_data_off_t *xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep);
+#define XFS_DIR2_DATA_ENTRY_TAG_P(dep)	xfs_dir2_data_entry_tag_p(dep)
+#else
+#define XFS_DIR2_DATA_ENTRY_TAG_P(dep)	\
+	((xfs_dir2_data_off_t *)\
+	 ((char *)(dep) + XFS_DIR2_DATA_ENTSIZE((dep)->namelen) - \
+	  (uint)sizeof(xfs_dir2_data_off_t)))
+#endif
+
+/*
+ * Pointer to a freespace's tag word.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATA_UNUSED_TAG_P)
+xfs_dir2_data_off_t *xfs_dir2_data_unused_tag_p_arch(
+	xfs_dir2_data_unused_t *dup, xfs_arch_t arch);
+#define XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup,arch) \
+	xfs_dir2_data_unused_tag_p_arch(dup,arch)
+#else
+#define XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup,arch)	\
+	((xfs_dir2_data_off_t *)\
+	 ((char *)(dup) + INT_GET((dup)->length, arch) \
+			- (uint)sizeof(xfs_dir2_data_off_t)))
+#endif
+
+/*
+ * Function declarations.
+ */
+
+#ifdef DEBUG
+extern void
+	xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp);
+#else
+#define xfs_dir2_data_check(dp,bp)
+#endif
+
+extern xfs_dir2_data_free_t *
+	xfs_dir2_data_freefind(xfs_dir2_data_t *d,
+			       xfs_dir2_data_unused_t *dup);
+
+extern xfs_dir2_data_free_t *
+	xfs_dir2_data_freeinsert(xfs_dir2_data_t *d,
+				 xfs_dir2_data_unused_t *dup, int *loghead);
+
+extern void
+	xfs_dir2_data_freeremove(xfs_dir2_data_t *d,
+				 xfs_dir2_data_free_t *dfp, int *loghead);
+
+extern void
+	xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d,
+			       int *loghead, char *aendp);
+
+extern int
+	xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
+			   struct xfs_dabuf **bpp);
+
+extern void
+	xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				xfs_dir2_data_entry_t *dep);
+
+extern void
+	xfs_dir2_data_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp);
+
+extern void
+	xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				 xfs_dir2_data_unused_t *dup);
+
+extern void
+	xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				xfs_dir2_data_aoff_t offset,
+				xfs_dir2_data_aoff_t len, int *needlogp,
+				int *needscanp);
+
+extern void
+	xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp,
+			       xfs_dir2_data_unused_t *dup,
+			       xfs_dir2_data_aoff_t offset,
+			       xfs_dir2_data_aoff_t len, int *needlogp,
+			       int *needscanp);
+
+#endif	/* __XFS_DIR2_DATA_H__ */
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
new file mode 100644
index 000000000000..c201111f7339
--- /dev/null
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -0,0 +1,1873 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_leaf.c
+ * XFS directory version 2 implementation - single leaf form
+ * see xfs_dir2_leaf.h for data structures.
+ * These directories have multiple XFS_DIR2_DATA blocks and one
+ * XFS_DIR2_LEAF1 block containing the hash table and freespace map.
+ */
+
+#include <xfs.h>
+
+/*
+ * Local function declarations.
+ */
+#ifdef DEBUG
+static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
+#else
+#define xfs_dir2_leaf_check(dp, bp)
+#endif
+static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp,
+				    int *indexp, xfs_dabuf_t **dbpp);
+
+/*
+ * Convert a block form directory to a leaf form directory.
+ */
+int						/* error */
+xfs_dir2_block_to_leaf(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*dbp)		/* input block's buffer */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* leaf's bestsp entries */
+	xfs_dablk_t		blkno;		/* leaf block's bno */
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_leaf_entry_t	*blp;		/* block's leaf entries */
+	xfs_dir2_block_tail_t	*btp;		/* block's tail */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dabuf_t		*lbp;		/* leaf block's buffer */
+	xfs_dir2_db_t		ldb;		/* leaf block's bno */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf's tail */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log block header */
+	int			needscan;	/* need to rescan bestfree */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_b("block_to_leaf", args, dbp);
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Add the leaf block to the inode.
+	 * This interface will only put blocks in the leaf/node range.
+	 * Since that's empty now, we'll get the root (block 0 in range).
+	 */
+	if ((error = xfs_da_grow_inode(args, &blkno))) {
+		return error;
+	}
+	ldb = XFS_DIR2_DA_TO_DB(mp, blkno);
+	ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
+	/*
+	 * Initialize the leaf block, get a buffer for it.
+	 */
+	if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) {
+		return error;
+	}
+	ASSERT(lbp != NULL);
+	leaf = lbp->data;
+	block = dbp->data;
+	xfs_dir2_data_check(dp, dbp);
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	/*
+	 * Set the counts in the leaf header.
+	 */
+	INT_COPY(leaf->hdr.count, btp->count, ARCH_CONVERT); /* INT_: type change */
+	INT_COPY(leaf->hdr.stale, btp->stale, ARCH_CONVERT); /* INT_: type change */
+	/*
+	 * Could compact these but I think we always do the conversion
+	 * after squeezing out stale entries.
+	 */
+	bcopy(blp, leaf->ents, INT_GET(btp->count, ARCH_CONVERT) * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir2_leaf_log_ents(tp, lbp, 0, INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1);
+	needscan = 0;
+	needlog = 1;
+	/*
+	 * Make the space formerly occupied by the leaf entries and block
+	 * tail be free.
+	 */
+	xfs_dir2_data_make_free(tp, dbp,
+		(xfs_dir2_data_aoff_t)((char *)blp - (char *)block),
+		(xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize -
+				       (char *)blp),
+		&needlog, &needscan);
+	/*
+	 * Fix up the block header, make it a data block.
+	 */
+	INT_SET(block->hdr.magic, ARCH_CONVERT, XFS_DIR2_DATA_MAGIC);
+	if (needscan)
+		xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog,
+			NULL);
+	/*
+	 * Set up leaf tail and bests table.
+	 */
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	INT_SET(ltp->bestcount, ARCH_CONVERT, 1);
+	bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+	INT_COPY(bestsp[0], block->hdr.bestfree[0].length, ARCH_CONVERT);
+	/*
+	 * Log the data header and leaf bests table.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	xfs_dir2_leaf_check(dp, lbp);
+	xfs_dir2_data_check(dp, dbp);
+	xfs_dir2_leaf_log_bests(tp, lbp, 0, 0);
+	xfs_da_buf_done(lbp);
+	return 0;
+}
+
+/*
+ * Add an entry to a leaf form directory.
+ */
+int						/* error */
+xfs_dir2_leaf_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* freespace table in leaf */
+	int			compact;	/* need to compact leaves */
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* data unused entry */
+	int			error;		/* error return value */
+	int			grown;		/* allocated new data block */
+	int			highstale;	/* index of next stale leaf */
+	int			i;		/* temporary, index */
+	int			index;		/* leaf table position */
+	xfs_dabuf_t		*lbp;		/* leaf's buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			length;		/* length of new entry */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry table pointer */
+	int			lfloglow;	/* low leaf logging index */
+	int			lfloghigh;	/* high leaf logging index */
+	int			lowstale;	/* index of prev stale leaf */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail pointer */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needbytes;	/* leaf block bytes needed */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data free */
+	xfs_dir2_data_off_t	*tagp;		/* end of data entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+	xfs_dir2_db_t		use_block;	/* data block number */
+
+	xfs_dir2_trace_args("leaf_addname", args);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the leaf block.
+	 */
+	error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
+		XFS_DATA_FORK);
+	if (error) {
+		return error;
+	}
+	ASSERT(lbp != NULL);
+	/*
+	 * Look up the entry by hash value and name.
+	 * We know it's not there, our caller has already done a lookup.
+	 * So the index is of the entry to insert in front of.
+	 * But if there are dup hash values the index is of the first of those.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, lbp);
+	leaf = lbp->data;
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+	length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+	/*
+	 * See if there are any entries with the same hash value
+	 * and space in their block for the new entry.
+	 * This is good because it puts multiple same-hash value entries
+	 * in a data block, improving the lookup of those entries.
+	 */
+	for (use_block = -1, lep = &leaf->ents[index];
+	     index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
+	     index++, lep++) {
+		if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		i = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+		ASSERT(i < INT_GET(ltp->bestcount, ARCH_CONVERT));
+		ASSERT(INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF);
+		if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
+			use_block = i;
+			break;
+		}
+	}
+	/*
+	 * Didn't find a block yet, linear search all the data blocks.
+	 */
+	if (use_block == -1) {
+		for (i = 0; i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++) {
+			/*
+			 * Remember a block we see that's missing.
+			 */
+			if (INT_GET(bestsp[i], ARCH_CONVERT) == NULLDATAOFF && use_block == -1)
+				use_block = i;
+			else if (INT_GET(bestsp[i], ARCH_CONVERT) >= length) {
+				use_block = i;
+				break;
+			}
+		}
+	}
+	/*
+	 * How many bytes do we need in the leaf block?
+	 */
+	needbytes =
+		(!INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT) ? 0 : (uint)sizeof(leaf->ents[0])) +
+		(use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0]));
+	/*
+	 * Now kill use_block if it refers to a missing block, so we
+	 * can use it as an indication of allocation needed.
+	 */
+	if (use_block != -1 && INT_GET(bestsp[use_block], ARCH_CONVERT) == NULLDATAOFF)
+		use_block = -1;
+	/*
+	 * If we don't have enough free bytes but we can make enough
+	 * by compacting out stale entries, we'll do that.
+	 */
+	if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] < needbytes &&
+	    INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1) {
+		compact = 1;
+	}
+	/*
+	 * Otherwise if we don't have enough free bytes we need to
+	 * convert to node form.
+	 */
+	else if ((char *)bestsp - (char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <
+		 needbytes) {
+		/*
+		 * Just checking or no space reservation, give up.
+		 */
+		if (args->justcheck || args->total == 0) {
+			xfs_da_brelse(tp, lbp);
+			return XFS_ERROR(ENOSPC);
+		}
+		/*
+		 * Convert to node form.
+		 */
+		error = xfs_dir2_leaf_to_node(args, lbp);
+		xfs_da_buf_done(lbp);
+		if (error)
+			return error;
+		/*
+		 * Then add the new entry.
+		 */
+		return xfs_dir2_node_addname(args);
+	}
+	/*
+	 * Otherwise it will fit without compaction.
+	 */
+	else
+		compact = 0;
+	/*
+	 * If just checking, then it will fit unless we needed to allocate
+	 * a new data block.
+	 */
+	if (args->justcheck) {
+		xfs_da_brelse(tp, lbp);
+		return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
+	}
+	/*
+	 * If no allocations are allowed, return now before we've
+	 * changed anything.
+	 */
+	if (args->total == 0 && use_block == -1) {
+		xfs_da_brelse(tp, lbp);
+		return XFS_ERROR(ENOSPC);
+	}
+	/*
+	 * Need to compact the leaf entries, removing stale ones.
+	 * Leave one stale entry behind - the one closest to our
+	 * insertion index - and we'll shift that one to our insertion
+	 * point later.
+	 */
+	if (compact) {
+		xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale,
+			&lfloglow, &lfloghigh);
+	}
+	/*
+	 * There are stale entries, so we'll need log-low and log-high
+	 * impossibly bad values later.
+	 */
+	else if (INT_GET(leaf->hdr.stale, ARCH_CONVERT)) {
+		lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		lfloghigh = -1;
+	}
+	/*
+	 * If there was no data block space found, we need to allocate
+	 * a new one.
+	 */
+	if (use_block == -1) {
+		/*
+		 * Add the new data block.
+		 */
+		if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+				&use_block))) {
+			xfs_da_brelse(tp, lbp);
+			return error;
+		}
+		/*
+		 * Initialize the block.
+		 */
+		if ((error = xfs_dir2_data_init(args, use_block, &dbp))) {
+			xfs_da_brelse(tp, lbp);
+			return error;
+		}
+		/*
+		 * If we're adding a new data block on the end we need to
+		 * extend the bests table.  Copy it up one entry.
+		 */
+		if (use_block >= INT_GET(ltp->bestcount, ARCH_CONVERT)) {
+			bestsp--;
+			ovbcopy(&bestsp[1], &bestsp[0],
+				INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(bestsp[0]));
+			INT_MOD(ltp->bestcount, ARCH_CONVERT, +1);
+			xfs_dir2_leaf_log_tail(tp, lbp);
+			xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+		}
+		/*
+		 * If we're filling in a previously empty block just log it.
+		 */
+		else
+			xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+		data = dbp->data;
+		INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		grown = 1;
+	}
+	/*
+	 * Already had space in some data block.
+	 * Just read that one in.
+	 */
+	else {
+		if ((error =
+		    xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block),
+			    -1, &dbp, XFS_DATA_FORK))) {
+			xfs_da_brelse(tp, lbp);
+			return error;
+		}
+		data = dbp->data;
+		grown = 0;
+	}
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * Point to the biggest freespace in our data block.
+	 */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
+	ASSERT(INT_GET(dup->length, ARCH_CONVERT) >= length);
+	needscan = needlog = 0;
+	/*
+	 * Mark the initial part of our freespace in use for the new entry.
+	 */
+	xfs_dir2_data_use_free(tp, dbp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+		&needlog, &needscan);
+	/*
+	 * Initialize our new entry (at last).
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->namelen = args->namelen;
+	bcopy(args->name, dep->name, dep->namelen);
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
+	/*
+	 * Need to scan fix up the bestfree table.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+	/*
+	 * Need to log the data block's header.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	xfs_dir2_data_log_entry(tp, dbp, dep);
+	/*
+	 * If the bests table needs to be changed, do it.
+	 * Log the change unless we've already done that.
+	 */
+	if (INT_GET(bestsp[use_block], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
+		INT_COPY(bestsp[use_block], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		if (!grown)
+			xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block);
+	}
+	/*
+	 * Now we need to make room to insert the leaf entry.
+	 * If there are no stale entries, we just insert a hole at index.
+	 */
+	if (INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT)) {
+		/*
+		 * lep is still good as the index leaf entry.
+		 */
+		if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+			ovbcopy(lep, lep + 1,
+				(INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
+		/*
+		 * Record low and high logging indices for the leaf.
+		 */
+		lfloglow = index;
+		lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
+	}
+	/*
+	 * There are stale entries.
+	 * We will use one of them for the new entry.
+	 * It's probably not at the right location, so we'll have to
+	 * shift some up or down first.
+	 */
+	else {
+		/*
+		 * If we didn't compact before, we need to find the nearest
+		 * stale entries before and after our insertion point.
+		 */
+		if (compact == 0) {
+			/*
+			 * Find the first stale entry before the insertion
+			 * point, if any.
+			 */
+			for (lowstale = index - 1;
+			     lowstale >= 0 &&
+				INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
+				XFS_DIR2_NULL_DATAPTR;
+			     lowstale--)
+				continue;
+			/*
+			 * Find the next stale entry at or after the insertion
+			 * point, if any.   Stop if we go so far that the
+			 * lowstale entry would be better.
+			 */
+			for (highstale = index;
+			     highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
+				INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
+				XFS_DIR2_NULL_DATAPTR &&
+				(lowstale < 0 ||
+				 index - lowstale - 1 >= highstale - index);
+			     highstale++)
+				continue;
+		}
+		/*
+		 * If the low one is better, use it.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
+		     index - lowstale - 1 < highstale - index)) {
+			ASSERT(index - lowstale - 1 >= 0);
+			ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
+			       XFS_DIR2_NULL_DATAPTR);
+			/*
+			 * Copy entries up to cover the stale entry
+			 * and make room for the new entry.
+			 */
+			if (index - lowstale - 1 > 0)
+				ovbcopy(&leaf->ents[lowstale + 1],
+					&leaf->ents[lowstale],
+					(index - lowstale - 1) * sizeof(*lep));
+			lep = &leaf->ents[index - 1];
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(index - 1, lfloghigh);
+		}
+		/*
+		 * The high one is better, so use that one.
+		 */
+		else {
+			ASSERT(highstale - index >= 0);
+			ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
+			       XFS_DIR2_NULL_DATAPTR);
+			/*
+			 * Copy entries down to copver the stale entry
+			 * and make room for the new entry.
+			 */
+			if (highstale - index > 0)
+				ovbcopy(&leaf->ents[index],
+					&leaf->ents[index + 1],
+					(highstale - index) * sizeof(*lep));
+			lep = &leaf->ents[index];
+			lfloglow = MIN(index, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
+	}
+	/*
+	 * Fill in the new leaf entry.
+	 */
+	INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
+	INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, INT_GET(*tagp, ARCH_CONVERT)));
+	/*
+	 * Log the leaf fields and give up the buffers.
+	 */
+	xfs_dir2_leaf_log_header(tp, lbp);
+	xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
+	xfs_dir2_leaf_check(dp, lbp);
+	xfs_da_buf_done(lbp);
+	xfs_dir2_data_check(dp, dbp);
+	xfs_da_buf_done(dbp);
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check the internal consistency of a leaf1 block.
+ * Pop an assert if something is wrong.
+ */
+void
+xfs_dir2_leaf_check(
+	xfs_inode_t		*dp,		/* incore directory inode */
+	xfs_dabuf_t		*bp)		/* leaf's buffer */
+{
+	int			i;		/* leaf index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail pointer */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			stale;		/* count of stale leaves */
+
+	leaf = bp->data;
+	mp = dp->i_mount;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
+	/*
+	 * This value is not restrictive enough.
+	 * Should factor in the size of the bests table as well.
+	 * We can deduce a value for that from di_size.
+	 */
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	/*
+	 * Leaves and bests don't overlap.
+	 */
+	ASSERT((char *)&leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT)] <=
+	       (char *)XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT));
+	/*
+	 * Check hash value order, count stale entries.
+	 */
+	for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
+		if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+			ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
+			       INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
+		if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			stale++;
+	}
+	ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
+}
+#endif	/* DEBUG */
+
+/*
+ * Compact out any stale entries in the leaf.
+ * Log the header and changed leaf entries, if any.
+ */
+void
+xfs_dir2_leaf_compact(
+	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_dabuf_t	*bp)		/* leaf buffer */
+{
+	int		from;		/* source leaf index */
+	xfs_dir2_leaf_t *leaf;		/* leaf structure */
+	int		loglow;		/* first leaf entry to log */
+	int		to;		/* target leaf index */
+
+	leaf = bp->data;
+	if (INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT)) {
+		return;
+	}
+	/*
+	 * Compress out the stale entries in place.
+	 */
+	for (from = to = 0, loglow = -1; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
+		if (INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Only actually copy the entries that are different.
+		 */
+		if (from > to) {
+			if (loglow == -1)
+				loglow = to;
+			leaf->ents[to] = leaf->ents[from];
+		}
+		to++;
+	}
+	/*
+	 * Update and log the header, log the leaf entries.
+	 */
+	ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == from - to);
+	INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(INT_GET(leaf->hdr.stale, ARCH_CONVERT)));
+	INT_ZERO(leaf->hdr.stale, ARCH_CONVERT);
+	xfs_dir2_leaf_log_header(args->trans, bp);
+	if (loglow != -1)
+		xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1);
+}
+
+/*
+ * Compact the leaf entries, removing stale ones.
+ * Leave one stale entry behind - the one closest to our
+ * insertion index - and the caller will shift that one to our insertion
+ * point later.
+ * Return new insertion index, where the remaining stale entry is,
+ * and leaf logging indices.
+ */
+void
+xfs_dir2_leaf_compact_x1(
+	xfs_dabuf_t	*bp,		/* leaf buffer */
+	int		*indexp,	/* insertion index */
+	int		*lowstalep,	/* out: stale entry before us */
+	int		*highstalep,	/* out: stale entry after us */
+	int		*lowlogp,	/* out: low log index */
+	int		*highlogp)	/* out: high log index */
+{
+	int		from;		/* source copy index */
+	int		highstale;	/* stale entry at/after index */
+	int		index;		/* insertion index */
+	int		keepstale;	/* source index of kept stale */
+	xfs_dir2_leaf_t *leaf;		/* leaf structure */
+	int		lowstale;	/* stale entry before index */
+	int		newindex=0;	/* new insertion index */
+	int		to;		/* destination copy index */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1);
+	index = *indexp;
+	/*
+	 * Find the first stale entry before our index, if any.
+	 */
+	for (lowstale = index - 1;
+	     lowstale >= 0 &&
+		INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR;
+	     lowstale--)
+		continue;
+	/*
+	 * Find the first stale entry at or after our index, if any.
+	 * Stop if the answer would be worse than lowstale.
+	 */
+	for (highstale = index;
+	     highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
+		INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) != XFS_DIR2_NULL_DATAPTR &&
+		(lowstale < 0 || index - lowstale > highstale - index);
+	     highstale++)
+		continue;
+	/*
+	 * Pick the better of lowstale and highstale.
+	 */
+	if (lowstale >= 0 &&
+	    (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
+	     index - lowstale <= highstale - index))
+		keepstale = lowstale;
+	else
+		keepstale = highstale;
+	/*
+	 * Copy the entries in place, removing all the stale entries
+	 * except keepstale.
+	 */
+	for (from = to = 0; from < INT_GET(leaf->hdr.count, ARCH_CONVERT); from++) {
+		/*
+		 * Notice the new value of index.
+		 */
+		if (index == from)
+			newindex = to;
+		if (from != keepstale &&
+		    INT_GET(leaf->ents[from].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR) {
+			if (from == to)
+				*lowlogp = to;
+			continue;
+		}
+		/*
+		 * Record the new keepstale value for the insertion.
+		 */
+		if (from == keepstale)
+			lowstale = highstale = to;
+		/*
+		 * Copy only the entries that have moved.
+		 */
+		if (from > to)
+			leaf->ents[to] = leaf->ents[from];
+		to++;
+	}
+	ASSERT(from > to);
+	/*
+	 * If the insertion point was past the last entry,
+	 * set the new insertion point accordingly.
+	 */
+	if (index == from)
+		newindex = to;
+	*indexp = newindex;
+	/*
+	 * Adjust the leaf header values.
+	 */
+	INT_MOD(leaf->hdr.count, ARCH_CONVERT, -(from - to));
+	INT_SET(leaf->hdr.stale, ARCH_CONVERT, 1);
+	/*
+	 * Remember the low/high stale value only in the "right"
+	 * direction.
+	 */
+	if (lowstale >= newindex)
+		lowstale = -1;
+	else
+		highstale = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	*highlogp = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1;
+	*lowstalep = lowstale;
+	*highstalep = highstale;
+}
+
+/*
+ * Getdents (readdir) for leaf and node directories.
+ * This reads the data blocks only, so is the same for both forms.
+ */
+int						/* error */
+xfs_dir2_leaf_getdents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_inode_t		*dp,		/* incore directory inode */
+	uio_t			*uio,		/* I/O control & vectors */
+	int			*eofp,		/* out: reached end of dir */
+	xfs_dirent_t		*dbp,		/* caller's buffer */
+	xfs_dir2_put_t		put)		/* ABI formatting routine */
+{
+	xfs_dabuf_t		*bp;		/* data block buffer */
+	int			byteoff;	/* offset in current block */
+	xfs_dir2_db_t		curdb;		/* db for current block */
+	xfs_dir2_off_t		curoff;		/* current overall offset */
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_dir2_data_unused_t	*dup;		/* unused entry */
+	int			eof;		/* reached end of directory */
+	int			error=0;		/* error return value */
+	int			i;		/* temporary loop index */
+	int			j;		/* temporary loop index */
+	int			length;		/* temporary length value */
+	xfs_bmbt_irec_t		*map;		/* map vector for blocks */
+	xfs_extlen_t		map_blocks;	/* number of fsbs in map */
+	xfs_dablk_t		map_off;	/* last mapped file offset */
+	int			map_size;	/* total entries in *map */
+	int			map_valid;	/* valid entries in *map */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
+	int			nmap;		/* mappings to ask xfs_bmapi */
+	xfs_dir2_put_args_t	p;		/* formatting arg bundle */
+	char			*ptr=NULL;		/* pointer to current data */
+	int			ra_current;	/* number of read-ahead blks */
+	int			ra_index;	/* *map index for read-ahead */
+	int			ra_offset;	/* map entry offset for ra */
+	int			ra_want;	/* readahead count wanted */
+
+	/*
+	 * If the offset is at or past the largest allowed value,
+	 * give up right away, return eof.
+	 */
+	if (uio->uio_offset >= XFS_DIR2_MAX_DATAPTR) {
+		*eofp = 1;
+		return 0;
+	}
+	mp = dp->i_mount;
+	/*
+	 * Setup formatting arguments.
+	 */
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+	/*
+	 * Set up to bmap a number of blocks based on the caller's
+	 * buffer size, the directory block size, and the filesystem
+	 * block size.
+	 */
+	map_size =
+		howmany(uio->uio_resid + mp->m_dirblksize,
+			mp->m_sb.sb_blocksize);
+	map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP);
+	map_valid = ra_index = ra_offset = ra_current = map_blocks = 0;
+	bp = NULL;
+	eof = 1;
+	/*
+	 * Inside the loop we keep the main offset value as a byte offset
+	 * in the directory file.
+	 */
+	curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset);
+	/*
+	 * Force this conversion through db so we truncate the offset
+	 * down to get the start of the data block.
+	 */
+	map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff));
+	/*
+	 * Loop over directory entries until we reach the end offset.
+	 * Get more blocks and readahead as necessary.
+	 */
+	while (curoff < XFS_DIR2_LEAF_OFFSET) {
+		/*
+		 * If we have no buffer, or we're off the end of the
+		 * current buffer, need to get another one.
+		 */
+		if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) {
+			/*
+			 * If we have a buffer, we need to release it and
+			 * take it out of the mapping.
+			 */
+			if (bp) {
+				xfs_da_brelse(tp, bp);
+				bp = NULL;
+				map_blocks -= mp->m_dirblkfsbs;
+				/*
+				 * Loop to get rid of the extents for the
+				 * directory block.
+				 */
+				for (i = mp->m_dirblkfsbs; i > 0; ) {
+					j = MIN((int)map->br_blockcount, i);
+					map->br_blockcount -= j;
+					map->br_startblock += j;
+					map->br_startoff += j;
+					/*
+					 * If mapping is done, pitch it from
+					 * the table.
+					 */
+					if (!map->br_blockcount && --map_valid)
+						ovbcopy(&map[1], &map[0],
+							sizeof(map[0]) *
+							map_valid);
+					i -= j;
+				}
+			}
+			/*
+			 * Recalculate the readahead blocks wanted.
+			 */
+			ra_want = howmany(uio->uio_resid + mp->m_dirblksize,
+					  mp->m_sb.sb_blocksize) - 1;
+			/*
+			 * If we don't have as many as we want, and we haven't
+			 * run out of data blocks, get some more mappings.
+			 */
+			if (1 + ra_want > map_blocks &&
+			    map_off <
+			    XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) {
+				/*
+				 * Get more bmaps, fill in after the ones
+				 * we already have in the table.
+				 */
+				nmap = map_size - map_valid;
+				error = xfs_bmapi(tp, dp,
+					map_off,
+					XFS_DIR2_BYTE_TO_DA(mp,
+						XFS_DIR2_LEAF_OFFSET) - map_off,
+					XFS_BMAPI_METADATA, NULL, 0,
+					&map[map_valid], &nmap, NULL);
+				/*
+				 * Don't know if we should ignore this or
+				 * try to return an error.
+				 * The trouble with returning errors
+				 * is that readdir will just stop without
+				 * actually passing the error through.
+				 */
+				if (error)
+					break;	/* XXX */
+				/*
+				 * If we got all the mappings we asked for,
+				 * set the final map offset based on the
+				 * last bmap value received.
+				 * Otherwise, we've reached the end.
+				 */
+				if (nmap == map_size - map_valid)
+					map_off =
+					map[map_valid + nmap - 1].br_startoff +
+					map[map_valid + nmap - 1].br_blockcount;
+				else
+					map_off =
+						XFS_DIR2_BYTE_TO_DA(mp,
+							XFS_DIR2_LEAF_OFFSET);
+				/*
+				 * Look for holes in the mapping, and
+				 * eliminate them.  Count up the valid blocks.
+				 */
+				for (i = map_valid; i < map_valid + nmap; ) {
+					if (map[i].br_startblock ==
+					    HOLESTARTBLOCK) {
+						nmap--;
+						length = map_valid + nmap - i;
+						if (length)
+							ovbcopy(&map[i + 1],
+								&map[i],
+								sizeof(map[i]) *
+								length);
+					} else {
+						map_blocks +=
+							map[i].br_blockcount;
+						i++;
+					}
+				}
+				map_valid += nmap;
+			}
+			/*
+			 * No valid mappings, so no more data blocks.
+			 */
+			if (!map_valid) {
+				curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off);
+				break;
+			}
+			/*
+			 * Read the directory block starting at the first
+			 * mapping.
+			 */
+			curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff);
+			error = xfs_da_read_buf(tp, dp, map->br_startoff,
+				map->br_blockcount >= mp->m_dirblkfsbs ?
+				    XFS_FSB_TO_DADDR(mp, map->br_startblock) :
+				    -1,
+				&bp, XFS_DATA_FORK);
+			/*
+			 * Should just skip over the data block instead
+			 * of giving up.
+			 */
+			if (error)
+				break;	/* XXX */
+			/*
+			 * Adjust the current amount of read-ahead: we just
+			 * read a block that was previously ra.
+			 */
+			if (ra_current)
+				ra_current -= mp->m_dirblkfsbs;
+			/*
+			 * Do we need more readahead?
+			 */
+			for (ra_index = ra_offset = i = 0;
+			     ra_want > ra_current && i < map_blocks;
+			     i += mp->m_dirblkfsbs) {
+				ASSERT(ra_index < map_valid);
+				/*
+				 * Read-ahead a contiguous directory block.
+				 */
+				if (i > ra_current &&
+				    map[ra_index].br_blockcount >=
+				    mp->m_dirblkfsbs) {
+					xfs_baread(mp->m_ddev_targp,
+						XFS_FSB_TO_DADDR(mp,
+						   map[ra_index].br_startblock +
+						   ra_offset),
+						(int)BTOBB(mp->m_dirblksize));
+					ra_current = i;
+				}
+				/*
+				 * Read-ahead a non-contiguous directory block.
+				 * This doesn't use our mapping, but this
+				 * is a very rare case.
+				 */
+				else if (i > ra_current) {
+					(void)xfs_da_reada_buf(tp, dp,
+						map[ra_index].br_startoff +
+						ra_offset, XFS_DATA_FORK);
+					ra_current = i;
+				}
+				/*
+				 * Advance offset through the mapping table.
+				 */
+				for (j = 0; j < mp->m_dirblkfsbs; j++) {
+					/*
+					 * The rest of this extent but not
+					 * more than a dir block.
+					 */
+					length = MIN(mp->m_dirblkfsbs,
+						(int)(map[ra_index].br_blockcount -
+						ra_offset));
+					j += length;
+					ra_offset += length;
+					/*
+					 * Advance to the next mapping if
+					 * this one is used up.
+					 */
+					if (ra_offset ==
+					    map[ra_index].br_blockcount) {
+						ra_offset = 0;
+						ra_index++;
+					}
+				}
+			}
+			/*
+			 * Having done a read, we need to set a new offset.
+			 */
+			newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0);
+			/*
+			 * Start of the current block.
+			 */
+			if (curoff < newoff)
+				curoff = newoff;
+			/*
+			 * Make sure we're in the right block.
+			 */
+			else if (curoff > newoff)
+				ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) ==
+				       curdb);
+			data = bp->data;
+			xfs_dir2_data_check(dp, bp);
+			/*
+			 * Find our position in the block.
+			 */
+			ptr = (char *)&data->u;
+			byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff);
+			/*
+			 * Skip past the header.
+			 */
+			if (byteoff == 0)
+				curoff += (uint)sizeof(data->hdr);
+			/*
+			 * Skip past entries until we reach our offset.
+			 */
+			else {
+				while ((char *)ptr - (char *)data < byteoff) {
+					dup = (xfs_dir2_data_unused_t *)ptr;
+
+					if (INT_GET(dup->freetag, ARCH_CONVERT)
+						  == XFS_DIR2_DATA_FREE_TAG) {
+
+						length = INT_GET(dup->length,
+								 ARCH_CONVERT);
+						ptr += length;
+						continue;
+					}
+					dep = (xfs_dir2_data_entry_t *)ptr;
+					length =
+					   XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+					ptr += length;
+				}
+				/*
+				 * Now set our real offset.
+				 */
+				curoff =
+					XFS_DIR2_DB_OFF_TO_BYTE(mp,
+					    XFS_DIR2_BYTE_TO_DB(mp, curoff),
+					    (char *)ptr - (char *)data);
+			}
+		}
+		/*
+		 * We have a pointer to an entry.
+		 * Is it a live one?
+		 */
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		/*
+		 * No, it's unused, skip over it.
+		 */
+		if (INT_GET(dup->freetag, ARCH_CONVERT)
+						== XFS_DIR2_DATA_FREE_TAG) {
+			length = INT_GET(dup->length, ARCH_CONVERT);
+			ptr += length;
+			curoff += length;
+			continue;
+		}
+
+		/*
+		 * Copy the entry into the putargs, and try formatting it.
+		 */
+		dep = (xfs_dir2_data_entry_t *)ptr;
+
+		p.namelen = dep->namelen;
+
+		length = XFS_DIR2_DATA_ENTSIZE(p.namelen);
+
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff);
+
+#if XFS_BIG_FILESYSTEMS
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT) + mp->m_inoadd;
+#else
+		p.ino = INT_GET(dep->inumber, ARCH_CONVERT);
+#endif
+		p.name = (char *)dep->name;
+
+		error = p.put(&p);
+
+		/*
+		 * Won't fit.  Return to caller.
+		 */
+		if (!p.done) {
+			eof = 0;
+			break;
+		}
+		/*
+		 * Advance to next entry in the block.
+		 */
+		ptr += length;
+		curoff += length;
+	}
+
+	/*
+	 * All done.  Set output offset value to current offset.
+	 */
+	*eofp = eof;
+	if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR))
+		uio->uio_offset = XFS_DIR2_MAX_DATAPTR;
+	else
+		uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff);
+	kmem_free(map, map_size * sizeof(*map));
+	if (bp)
+		xfs_da_brelse(tp, bp);
+	return error;
+}
+
+/*
+ * Initialize a new leaf block, leaf1 or leafn magic accepted.
+ */
+int
+xfs_dir2_leaf_init(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dir2_db_t		bno,		/* directory block number */
+	xfs_dabuf_t		**bpp,		/* out: leaf buffer */
+	int			magic)		/* magic number for block */
+{
+	xfs_dabuf_t		*bp;		/* leaf buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	tp = args->trans;
+	mp = dp->i_mount;
+	ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
+	       bno < XFS_DIR2_FREE_FIRSTDB(mp));
+	/*
+	 * Get the buffer for the block.
+	 */
+	error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp,
+		XFS_DATA_FORK);
+	if (error) {
+		return error;
+	}
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	/*
+	 * Initialize the header.
+	 */
+	INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, magic);
+	INT_ZERO(leaf->hdr.info.forw, ARCH_CONVERT);
+	INT_ZERO(leaf->hdr.info.back, ARCH_CONVERT);
+	INT_ZERO(leaf->hdr.count, ARCH_CONVERT);
+	INT_ZERO(leaf->hdr.stale, ARCH_CONVERT);
+	xfs_dir2_leaf_log_header(tp, bp);
+	/*
+	 * If it's a leaf-format directory initialize the tail.
+	 * In this case our caller has the real bests table to copy into
+	 * the block.
+	 */
+	if (magic == XFS_DIR2_LEAF1_MAGIC) {
+		ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+		INT_ZERO(ltp->bestcount, ARCH_CONVERT);
+		xfs_dir2_leaf_log_tail(tp, bp);
+	}
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Log the bests entries indicated from a leaf1 block.
+ */
+void
+xfs_dir2_leaf_log_bests(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	xfs_dir2_data_off_t	*firstb;	/* pointer to first entry */
+	xfs_dir2_data_off_t	*lastb;		/* pointer to last entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
+	ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf);
+	firstb = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT) + first;
+	lastb = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT) + last;
+	xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
+		(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
+}
+
+/*
+ * Log the leaf entries indicated from a leaf1 or leafn block.
+ */
+void
+xfs_dir2_leaf_log_ents(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	xfs_dir2_leaf_entry_t	*firstlep;	/* pointer to first entry */
+	xfs_dir2_leaf_entry_t	*lastlep;	/* pointer to last entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
+	       INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	firstlep = &leaf->ents[first];
+	lastlep = &leaf->ents[last];
+	xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
+		(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
+}
+
+/*
+ * Log the header of the leaf1 or leafn block.
+ */
+void
+xfs_dir2_leaf_log_header(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp)		/* leaf buffer */
+{
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC ||
+	       INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
+		(uint)(sizeof(leaf->hdr) - 1));
+}
+
+/*
+ * Log the tail of the leaf1 block.
+ */
+void
+xfs_dir2_leaf_log_tail(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp)		/* leaf buffer */
+{
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	mp = tp->t_mountp;
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAF1_MAGIC);
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
+		(uint)(mp->m_dirblksize - 1));
+}
+
+/*
+ * Look up the entry referred to by args in the leaf format directory.
+ * Most of the work is done by the xfs_dir2_leaf_lookup_int routine which
+ * is also used by the node-format code.
+ */
+int
+xfs_dir2_leaf_lookup(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* found entry index */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("leaf_lookup", args);
+	/*
+	 * Look up name in the leaf block, returning both buffers and index.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	tp = args->trans;
+	dp = args->dp;
+	xfs_dir2_leaf_check(dp, lbp);
+	leaf = lbp->data;
+	/*
+	 * Get to the leaf entry and contained data entry address.
+	 */
+	lep = &leaf->ents[index];
+	/*
+	 * Point to the data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)dbp->data +
+	       XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
+	/*
+	 * Return the found inode number.
+	 */
+	args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+	xfs_da_brelse(tp, dbp);
+	xfs_da_brelse(tp, lbp);
+	return XFS_ERROR(EEXIST);
+}
+
+/*
+ * Look up name/hash in the leaf block.
+ * Fill in indexp with the found index, and dbpp with the data buffer.
+ * If not found dbpp will be NULL, and ENOENT comes back.
+ * lbpp will always be filled in with the leaf buffer unless there's an error.
+ */
+static int					/* error */
+xfs_dir2_leaf_lookup_int(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		**lbpp,		/* out: leaf buffer */
+	int			*indexp,	/* out: index in leaf block */
+	xfs_dabuf_t		**dbpp)		/* out: data buffer */
+{
+	xfs_dir2_db_t		curdb;		/* current data block number */
+	xfs_dabuf_t		*dbp;		/* data buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index in leaf block */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	/*
+	 * Read the leaf block into the buffer.
+	 */
+	if ((error =
+	    xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
+		    XFS_DATA_FORK))) {
+		return error;
+	}
+	*lbpp = lbp;
+	leaf = lbp->data;
+	xfs_dir2_leaf_check(dp, lbp);
+	/*
+	 * Look for the first leaf entry with our hash value.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, lbp);
+	/*
+	 * Loop over all the entries with the right hash value
+	 * looking to match the name.
+	 */
+	for (lep = &leaf->ents[index], dbp = NULL, curdb = -1;
+	     index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip over stale leaf entries.
+		 */
+		if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Get the new data block number.
+		 */
+		newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+		/*
+		 * If it's not the same as the old data block number,
+		 * need to pitch the old one and read the new one.
+		 */
+		if (newdb != curdb) {
+			if (dbp)
+				xfs_da_brelse(tp, dbp);
+			if ((error =
+			    xfs_da_read_buf(tp, dp,
+				    XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp,
+				    XFS_DATA_FORK))) {
+				xfs_da_brelse(tp, lbp);
+				return error;
+			}
+			xfs_dir2_data_check(dp, dbp);
+			curdb = newdb;
+		}
+		/*
+		 * Point to the data entry.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)dbp->data +
+		       XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
+		/*
+		 * If it matches then return it.
+		 */
+		if (dep->namelen == args->namelen &&
+		    dep->name[0] == args->name[0] &&
+		    bcmp(dep->name, args->name, args->namelen) == 0) {
+			*dbpp = dbp;
+			*indexp = index;
+			return 0;
+		}
+	}
+	/*
+	 * No match found, return ENOENT.
+	 */
+	ASSERT(args->oknoent);
+	if (dbp)
+		xfs_da_brelse(tp, dbp);
+	xfs_da_brelse(tp, lbp);
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Remove an entry from a leaf format directory.
+ */
+int						/* error */
+xfs_dir2_leaf_removename(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* leaf block best freespace */
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_db_t		db;		/* data block number */
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry structure */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_db_t		i;		/* temporary data block # */
+	int			index;		/* index into leaf entries */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_dir2_data_off_t	oldbest;	/* old value of best free */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("leaf_removename", args);
+	/*
+	 * Lookup the leaf entry, get the leaf and data blocks read in.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = lbp->data;
+	data = dbp->data;
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * Point to the leaf entry, use that to point to the data entry.
+	 */
+	lep = &leaf->ents[index];
+	db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
+	needscan = needlog = 0;
+	oldbest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+	ASSERT(INT_GET(bestsp[db], ARCH_CONVERT) == oldbest);
+	/*
+	 * Mark the former data entry unused.
+	 */
+	xfs_dir2_data_make_free(tp, dbp,
+		(xfs_dir2_data_aoff_t)((char *)dep - (char *)data),
+		XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+	/*
+	 * We just mark the leaf entry stale by putting a null in it.
+	 */
+	INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
+	xfs_dir2_leaf_log_header(tp, lbp);
+	INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_leaf_log_ents(tp, lbp, index, index);
+	/*
+	 * Scan the freespace in the data block again if necessary,
+	 * log the data block header if necessary.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	/*
+	 * If the longest freespace in the data block has changed,
+	 * put the new value in the bests table and log that.
+	 */
+	if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) != oldbest) {
+		INT_COPY(bestsp[db], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		xfs_dir2_leaf_log_bests(tp, lbp, db, db);
+	}
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * If the data block is now empty then get rid of the data block.
+	 */
+	if (INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
+	    mp->m_dirblksize - (uint)sizeof(data->hdr)) {
+		ASSERT(db != mp->m_dirdatablk);
+		if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+			/*
+			 * Nope, can't get rid of it because it caused
+			 * allocation of a bmap btree block to do so.
+			 * Just go on, returning success, leaving the
+			 * empty block in place.
+			 */
+			if (error == ENOSPC && args->total == 0) {
+				xfs_da_buf_done(dbp);
+				error = 0;
+			}
+			xfs_dir2_leaf_check(dp, lbp);
+			xfs_da_buf_done(lbp);
+			return error;
+		}
+		dbp = NULL;
+		/*
+		 * If this is the last data block then compact the
+		 * bests table by getting rid of entries.
+		 */
+		if (db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1) {
+			/*
+			 * Look for the last active entry (i).
+			 */
+			for (i = db - 1; i > 0; i--) {
+				if (INT_GET(bestsp[i], ARCH_CONVERT) != NULLDATAOFF)
+					break;
+			}
+			/*
+			 * Copy the table down so inactive entries at the
+			 * end are removed.
+			 */
+			ovbcopy(bestsp, &bestsp[db - i],
+				(INT_GET(ltp->bestcount, ARCH_CONVERT) - (db - i)) * sizeof(*bestsp));
+			INT_MOD(ltp->bestcount, ARCH_CONVERT, -(db - i));
+			xfs_dir2_leaf_log_tail(tp, lbp);
+			xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+		} else
+			INT_SET(bestsp[db], ARCH_CONVERT, NULLDATAOFF);
+	}
+	/*
+	 * If the data block was not the first one, drop it.
+	 */
+	else if (db != mp->m_dirdatablk && dbp != NULL) {
+		xfs_da_buf_done(dbp);
+		dbp = NULL;
+	}
+	xfs_dir2_leaf_check(dp, lbp);
+	/*
+	 * See if we can convert to block form.
+	 */
+	return xfs_dir2_leaf_to_block(args, lbp, dbp);
+}
+
+/*
+ * Replace the inode number in a leaf format directory entry.
+ */
+int						/* error */
+xfs_dir2_leaf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	int			index;		/* index of leaf entry */
+	xfs_dabuf_t		*lbp;		/* leaf buffer */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args("leaf_replace", args);
+	/*
+	 * Look up the entry.
+	 */
+	if ((error = xfs_dir2_leaf_lookup_int(args, &lbp, &index, &dbp))) {
+		return error;
+	}
+	dp = args->dp;
+	leaf = lbp->data;
+	/*
+	 * Point to the leaf entry, get data address from it.
+	 */
+	lep = &leaf->ents[index];
+	/*
+	 * Point to the data entry.
+	 */
+	dep = (xfs_dir2_data_entry_t *)
+	      ((char *)dbp->data +
+	       XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, INT_GET(lep->address, ARCH_CONVERT)));
+	ASSERT(args->inumber != INT_GET(dep->inumber, ARCH_CONVERT));
+	/*
+	 * Put the new inode number in, log it.
+	 */
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	tp = args->trans;
+	xfs_dir2_data_log_entry(tp, dbp, dep);
+	xfs_da_buf_done(dbp);
+	xfs_dir2_leaf_check(dp, lbp);
+	xfs_da_brelse(tp, lbp);
+	return 0;
+}
+
+/*
+ * Return index in the leaf block (lbp) which is either the first
+ * one with this hash value, or if there are none, the insert point
+ * for that hash value.
+ */
+int						/* index value */
+xfs_dir2_leaf_search_hash(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*lbp)		/* leaf buffer */
+{
+	xfs_dahash_t		hash=0;		/* hash from this entry */
+	xfs_dahash_t		hashwant;	/* hash value looking for */
+	int			high;		/* high leaf index */
+	int			low;		/* low leaf index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			mid=0;		/* current leaf index */
+
+	leaf = lbp->data;
+#ifndef __KERNEL__
+	if (INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+		return 0;
+#endif
+	/*
+	 * Note, the table cannot be empty, so we have to go through the loop.
+	 * Binary search the leaf entries looking for our hash value.
+	 */
+	for (lep = leaf->ents, low = 0, high = INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1,
+		hashwant = args->hashval;
+	     low <= high; ) {
+		mid = (low + high) >> 1;
+		if ((hash = INT_GET(lep[mid].hashval, ARCH_CONVERT)) == hashwant)
+			break;
+		if (hash < hashwant)
+			low = mid + 1;
+		else
+			high = mid - 1;
+	}
+	/*
+	 * Found one, back up through all the equal hash values.
+	 */
+	if (hash == hashwant) {
+		while (mid > 0 && INT_GET(lep[mid - 1].hashval, ARCH_CONVERT) == hashwant) {
+			mid--;
+		}
+	}
+	/*
+	 * Need to point to an entry higher than ours.
+	 */
+	else if (hash < hashwant)
+		mid++;
+	return mid;
+}
+
+/*
+ * Trim off a trailing data block.  We know it's empty since the leaf
+ * freespace table says so.
+ */
+int						/* error */
+xfs_dir2_leaf_trim_data(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*lbp,		/* leaf buffer */
+	xfs_dir2_db_t		db)		/* data block number */
+{
+	xfs_dir2_data_off_t	*bestsp;	/* leaf bests table */
+#ifdef DEBUG
+	xfs_dir2_data_t		*data;		/* data block structure */
+#endif
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Read the offending data block.  We need its buffer.
+	 */
+	if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+#ifdef DEBUG
+	data = dbp->data;
+	ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+#endif
+	/* this seems to be an error
+	 * data is only valid if DEBUG is defined?
+	 * RMC 09/08/1999
+	 */
+
+	leaf = lbp->data;
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) ==
+	       mp->m_dirblksize - (uint)sizeof(data->hdr));
+	ASSERT(db == INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+	/*
+	 * Get rid of the data block.
+	 */
+	if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
+		ASSERT(error != ENOSPC);
+		xfs_da_brelse(tp, dbp);
+		return error;
+	}
+	/*
+	 * Eliminate the last bests entry from the table.
+	 */
+	bestsp = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT);
+	INT_MOD(ltp->bestcount, ARCH_CONVERT, -1);
+	ovbcopy(&bestsp[0], &bestsp[1], INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(*bestsp));
+	xfs_dir2_leaf_log_tail(tp, lbp);
+	xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+	return 0;
+}
+
+/*
+ * Convert node form directory to leaf form directory.
+ * The root of the node form dir needs to already be a LEAFN block.
+ * Just return if we can't do anything.
+ */
+int						/* error */
+xfs_dir2_node_to_leaf(
+	xfs_da_state_t		*state)		/* directory operation state */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dabuf_t		*fbp;		/* buffer for freespace block */
+	xfs_fileoff_t		fo;		/* freespace file offset */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_dabuf_t		*lbp;		/* buffer for leaf block */
+	xfs_dir2_leaf_tail_t	*ltp;		/* tail of leaf structure */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			rval;		/* successful free trim? */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	/*
+	 * There's more than a leaf level in the btree, so there must
+	 * be multiple leafn blocks.  Give up.
+	 */
+	if (state->path.active > 1)
+		return 0;
+	args = state->args;
+	xfs_dir2_trace_args("node_to_leaf", args);
+	mp = state->mp;
+	dp = args->dp;
+	tp = args->trans;
+	/*
+	 * Get the last offset in the file.
+	 */
+	if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	fo -= mp->m_dirblkfsbs;
+	/*
+	 * If there are freespace blocks other than the first one,
+	 * take this opportunity to remove trailing empty freespace blocks
+	 * that may have been left behind during no-space-reservation
+	 * operations.
+	 */
+	while (fo > mp->m_dirfreeblk) {
+		if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
+			return error;
+		}
+		if (rval)
+			fo -= mp->m_dirblkfsbs;
+		else
+			return 0;
+	}
+	/*
+	 * Now find the block just before the freespace block.
+	 */
+	if ((error = xfs_bmap_last_before(tp, dp, &fo, XFS_DATA_FORK))) {
+		return error;
+	}
+	/*
+	 * If it's not the single leaf block, give up.
+	 */
+	if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
+		return 0;
+	lbp = state->path.blk[0].bp;
+	leaf = lbp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Read the freespace block.
+	 */
+	if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+	free = fbp->data;
+	ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	ASSERT(INT_ISZERO(free->hdr.firstdb, ARCH_CONVERT));
+	/*
+	 * Now see if the leafn and free data will fit in a leaf1.
+	 * If not, release the buffer and give up.
+	 */
+	if ((uint)sizeof(leaf->hdr) +
+	    (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT)) * (uint)sizeof(leaf->ents[0]) +
+	    INT_GET(free->hdr.nvalid, ARCH_CONVERT) * (uint)sizeof(leaf->bests[0]) +
+	    (uint)sizeof(leaf->tail) >
+	    mp->m_dirblksize) {
+		xfs_da_brelse(tp, fbp);
+		return 0;
+	}
+	/*
+	 * If the leaf has any stale entries in it, compress them out.
+	 * The compact routine will log the header.
+	 */
+	if (INT_GET(leaf->hdr.stale, ARCH_CONVERT))
+		xfs_dir2_leaf_compact(args, lbp);
+	else
+		xfs_dir2_leaf_log_header(tp, lbp);
+	INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAF1_MAGIC);
+	/*
+	 * Set up the leaf tail from the freespace block.
+	 */
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	INT_COPY(ltp->bestcount, free->hdr.nvalid, ARCH_CONVERT);
+	/*
+	 * Set up the leaf bests table.
+	 */
+	bcopy(free->bests, XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT),
+		INT_GET(ltp->bestcount, ARCH_CONVERT) * sizeof(leaf->bests[0]));
+	xfs_dir2_leaf_log_bests(tp, lbp, 0, INT_GET(ltp->bestcount, ARCH_CONVERT) - 1);
+	xfs_dir2_leaf_log_tail(tp, lbp);
+	xfs_dir2_leaf_check(dp, lbp);
+	/*
+	 * Get rid of the freespace block.
+	 */
+	error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp);
+	if (error) {
+		/*
+		 * This can't fail here because it can only happen when
+		 * punching out the middle of an extent, and this is an
+		 * isolated block.
+		 */
+		ASSERT(error != ENOSPC);
+		return error;
+	}
+	fbp = NULL;
+	/*
+	 * Now see if we can convert the single-leaf directory
+	 * down to a block form directory.
+	 * This routine always kills the dabuf for the leaf, so
+	 * eliminate it from the path.
+	 */
+	error = xfs_dir2_leaf_to_block(args, lbp, NULL);
+	state->path.blk[0].bp = NULL;
+	return error;
+}
diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h
new file mode 100644
index 000000000000..778645e173d2
--- /dev/null
+++ b/fs/xfs/xfs_dir2_leaf.h
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_LEAF_H__
+#define __XFS_DIR2_LEAF_H__
+
+/*
+ * Directory version 2, leaf block structures.
+ */
+
+struct dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Constants.
+ */
+
+/*
+ * Offset of the leaf/node space.  First block in this space
+ * is the btree root.
+ */
+#define XFS_DIR2_LEAF_SPACE	1
+#define XFS_DIR2_LEAF_OFFSET	(XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
+#define XFS_DIR2_LEAF_FIRSTDB(mp)	\
+	XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET)
+
+/*
+ * Types.
+ */
+
+/*
+ * Offset in data space of a data entry.
+ */
+typedef __uint32_t	xfs_dir2_dataptr_t;
+#define XFS_DIR2_MAX_DATAPTR	((xfs_dir2_dataptr_t)0x7fffffff)
+#define XFS_DIR2_NULL_DATAPTR	((xfs_dir2_dataptr_t)0)
+
+/*
+ * Structures.
+ */
+
+/*
+ * Leaf block header.
+ */
+typedef struct xfs_dir2_leaf_hdr {
+	xfs_da_blkinfo_t	info;		/* header for da routines */
+	__uint16_t		count;		/* count of entries */
+	__uint16_t		stale;		/* count of stale entries */
+} xfs_dir2_leaf_hdr_t;
+
+/*
+ * Leaf block entry.
+ */
+typedef struct xfs_dir2_leaf_entry {
+	xfs_dahash_t		hashval;	/* hash value of name */
+	xfs_dir2_dataptr_t	address;	/* address of data entry */
+} xfs_dir2_leaf_entry_t;
+
+/*
+ * Leaf block tail.
+ */
+typedef struct xfs_dir2_leaf_tail {
+	__uint32_t		bestcount;
+} xfs_dir2_leaf_tail_t;
+
+/*
+ * Leaf block.
+ * bests and tail are at the end of the block for single-leaf only
+ * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC).
+ */
+typedef struct xfs_dir2_leaf {
+	xfs_dir2_leaf_hdr_t	hdr;		/* leaf header */
+	xfs_dir2_leaf_entry_t	ents[1];	/* entries */
+						/* ... */
+	xfs_dir2_data_off_t	bests[1];	/* best free counts */
+	xfs_dir2_leaf_tail_t	tail;		/* leaf tail */
+} xfs_dir2_leaf_t;
+
+/*
+ * Macros.
+ * The DB blocks are logical directory block numbers, not filesystem blocks.
+ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_MAX_LEAF_ENTS)
+int
+xfs_dir2_max_leaf_ents(struct xfs_mount *mp);
+#define XFS_DIR2_MAX_LEAF_ENTS(mp)	\
+	xfs_dir2_max_leaf_ents(mp)
+#else
+#define XFS_DIR2_MAX_LEAF_ENTS(mp)	\
+	((int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / \
+	       (uint)sizeof(xfs_dir2_leaf_entry_t)))
+#endif
+
+/*
+ * Get address of the bestcount field in the single-leaf block.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_TAIL_P)
+xfs_dir2_leaf_tail_t *
+xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp);
+#define XFS_DIR2_LEAF_TAIL_P(mp,lp)	\
+	xfs_dir2_leaf_tail_p(mp, lp)
+#else
+#define XFS_DIR2_LEAF_TAIL_P(mp,lp)	\
+	((xfs_dir2_leaf_tail_t *)\
+	 ((char *)(lp) + (mp)->m_dirblksize - \
+	  (uint)sizeof(xfs_dir2_leaf_tail_t)))
+#endif
+
+/*
+ * Get address of the bests array in the single-leaf block.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_LEAF_BESTS_P)
+xfs_dir2_data_off_t *
+xfs_dir2_leaf_bests_p_arch(xfs_dir2_leaf_tail_t *ltp, xfs_arch_t arch);
+#define XFS_DIR2_LEAF_BESTS_P_ARCH(ltp,arch)	xfs_dir2_leaf_bests_p_arch(ltp,arch)
+#else
+#define XFS_DIR2_LEAF_BESTS_P_ARCH(ltp,arch)	\
+	((xfs_dir2_data_off_t *)(ltp) - INT_GET((ltp)->bestcount, arch))
+#endif
+
+/*
+ * Convert dataptr to byte in file space
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_BYTE)
+xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
+#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp)
+#else
+#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) \
+	((xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG)
+#endif
+
+/*
+ * Convert byte in file space to dataptr.  It had better be aligned.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DATAPTR)
+xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by);
+#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by)
+#else
+#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) \
+	((xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG))
+#endif
+
+/*
+ * Convert dataptr to a block number
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_DB)
+xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
+#define XFS_DIR2_DATAPTR_TO_DB(mp,dp)	xfs_dir2_dataptr_to_db(mp, dp)
+#else
+#define XFS_DIR2_DATAPTR_TO_DB(mp,dp)	\
+	XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp))
+#endif
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DATAPTR_TO_OFF)
+xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp);
+#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp)	xfs_dir2_dataptr_to_off(mp, dp)
+#else
+#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp)	\
+	XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp))
+#endif
+
+/*
+ * Convert block and offset to byte in space
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_BYTE)
+xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
+			xfs_dir2_data_aoff_t o);
+#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o)	\
+	xfs_dir2_db_off_to_byte(mp, db, o)
+#else
+#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o)	\
+	(((xfs_dir2_off_t)(db) << \
+	 ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o))
+#endif
+
+/*
+ * Convert byte in space to (DB) block
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DB)
+xfs_dir2_db_t xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by);
+#define XFS_DIR2_BYTE_TO_DB(mp,by)	xfs_dir2_byte_to_db(mp, by)
+#else
+#define XFS_DIR2_BYTE_TO_DB(mp,by)	\
+	((xfs_dir2_db_t)((by) >> \
+			 ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)))
+#endif
+
+/*
+ * Convert byte in space to (DA) block
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_DA)
+xfs_dablk_t xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by);
+#define XFS_DIR2_BYTE_TO_DA(mp,by)	xfs_dir2_byte_to_da(mp, by)
+#else
+#define XFS_DIR2_BYTE_TO_DA(mp,by)	\
+	XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by))
+#endif
+
+/*
+ * Convert byte in space to offset in a block
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_BYTE_TO_OFF)
+xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by);
+#define XFS_DIR2_BYTE_TO_OFF(mp,by)	xfs_dir2_byte_to_off(mp, by)
+#else
+#define XFS_DIR2_BYTE_TO_OFF(mp,by)	\
+	((xfs_dir2_data_aoff_t)((by) & \
+				((1 << ((mp)->m_sb.sb_blocklog + \
+					(mp)->m_sb.sb_dirblklog)) - 1)))
+#endif
+
+/*
+ * Convert block and offset to dataptr
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_OFF_TO_DATAPTR)
+xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
+			   xfs_dir2_data_aoff_t o);
+#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o)	\
+	xfs_dir2_db_off_to_dataptr(mp, db, o)
+#else
+#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o)	\
+	XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o))
+#endif
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_DA)
+xfs_dablk_t xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db);
+#define XFS_DIR2_DB_TO_DA(mp,db)	xfs_dir2_db_to_da(mp, db)
+#else
+#define XFS_DIR2_DB_TO_DA(mp,db)	\
+	((xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog))
+#endif
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_DB)
+xfs_dir2_db_t xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da);
+#define XFS_DIR2_DA_TO_DB(mp,da)	xfs_dir2_da_to_db(mp, da)
+#else
+#define XFS_DIR2_DA_TO_DB(mp,da)	\
+	((xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog))
+#endif
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DA_TO_BYTE)
+xfs_dir2_off_t xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da);
+#define XFS_DIR2_DA_TO_BYTE(mp,da)	xfs_dir2_da_to_byte(mp, da)
+#else
+#define XFS_DIR2_DA_TO_BYTE(mp,da)	\
+	XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0)
+#endif
+
+/*
+ * Function declarations.
+ */
+
+extern int
+	xfs_dir2_block_to_leaf(struct xfs_da_args *args, struct xfs_dabuf *dbp);
+
+extern int
+	xfs_dir2_leaf_addname(struct xfs_da_args *args);
+
+extern void
+	xfs_dir2_leaf_compact(struct xfs_da_args *args, struct xfs_dabuf *bp);
+
+extern void
+	xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp,
+				 int *lowstalep, int *highstalep, int *lowlogp,
+				 int *highlogp);
+
+extern int
+	xfs_dir2_leaf_getdents(struct xfs_trans *tp, struct xfs_inode *dp,
+			       struct uio *uio, int *eofp, struct xfs_dirent *dbp,
+			       xfs_dir2_put_t put);
+
+extern int
+	xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno,
+			   struct xfs_dabuf **bpp, int magic);
+
+extern void
+	xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp,
+			       int first, int last);
+
+extern void
+	xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				int first, int last);
+
+extern void
+	xfs_dir2_leaf_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp);
+
+extern void
+	xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp);
+
+extern int
+	xfs_dir2_leaf_lookup(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_leaf_removename(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_leaf_replace(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_leaf_search_hash(struct xfs_da_args *args,
+				  struct xfs_dabuf *lbp);
+extern int
+	xfs_dir2_leaf_trim_data(struct xfs_da_args *args, struct xfs_dabuf *lbp,				xfs_dir2_db_t db);
+
+extern int
+	xfs_dir2_node_to_leaf(struct xfs_da_state *state);
+
+#endif	/* __XFS_DIR2_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
new file mode 100644
index 000000000000..99661539e595
--- /dev/null
+++ b/fs/xfs/xfs_dir2_node.c
@@ -0,0 +1,1976 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_node.c
+ * XFS directory implementation, version 2, node form files
+ * See data structures in xfs_dir2_node.h and xfs_da_btree.h.
+ */
+
+#include <xfs.h>
+
+/*
+ * Function declarations.
+ */
+static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp);
+static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index);
+#ifdef DEBUG
+static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp);
+#else
+#define xfs_dir2_leafn_check(dp, bp)
+#endif
+static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s,
+				    int start_s, xfs_dabuf_t *bp_d, int start_d,
+				    int count);
+static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state,
+				     xfs_da_state_blk_t *blk1,
+				     xfs_da_state_blk_t *blk2);
+static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp,
+				 int index, xfs_da_state_blk_t *dblk,
+				 int *rval);
+static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
+				     xfs_da_state_blk_t *fblk);
+
+/*
+ * Log entries from a freespace block.
+ */
+void
+xfs_dir2_free_log_bests(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp,		/* freespace buffer */
+	int			first,		/* first entry to log */
+	int			last)		/* last entry to log */
+{
+	xfs_dir2_free_t		*free;		/* freespace structure */
+
+	free = bp->data;
+	ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	xfs_da_log_buf(tp, bp,
+		(uint)((char *)&free->bests[first] - (char *)free),
+		(uint)((char *)&free->bests[last] - (char *)free +
+		       sizeof(free->bests[0]) - 1));
+}
+
+/*
+ * Log header from a freespace block.
+ */
+static void
+xfs_dir2_free_log_header(
+	xfs_trans_t		*tp,		/* transaction pointer */
+	xfs_dabuf_t		*bp)		/* freespace buffer */
+{
+	xfs_dir2_free_t		*free;		/* freespace structure */
+
+	free = bp->data;
+	ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free),
+		(uint)(sizeof(xfs_dir2_free_hdr_t) - 1));
+}
+
+/*
+ * Convert a leaf-format directory to a node-format directory.
+ * We need to change the magic number of the leaf block, and copy
+ * the freespace table out of the leaf block into its own block.
+ */
+int						/* error */
+xfs_dir2_leaf_to_node(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*lbp)		/* leaf buffer */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	xfs_dabuf_t		*fbp;		/* freespace buffer */
+	xfs_dir2_db_t		fdb;		/* freespace block number */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_dir2_data_off_t	*from;		/* pointer to freespace entry */
+	int			i;		/* leaf freespace index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_tail_t	*ltp;		/* leaf tail structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			n;		/* count of live freespc ents */
+	xfs_dir2_data_off_t	off;		/* freespace entry value */
+	xfs_dir2_data_off_t	*to;		/* pointer to freespace entry */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_b("leaf_to_node", args, lbp);
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Add a freespace block to the directory.
+	 */
+	if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
+		return error;
+	}
+	ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp));
+	/*
+	 * Get the buffer for the new freespace block.
+	 */
+	if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+	ASSERT(fbp != NULL);
+	free = fbp->data;
+	leaf = lbp->data;
+	ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf);
+	/*
+	 * Initialize the freespace block header.
+	 */
+	INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
+	INT_ZERO(free->hdr.firstdb, ARCH_CONVERT);
+	ASSERT(INT_GET(ltp->bestcount, ARCH_CONVERT) <= (uint)dp->i_d.di_size / mp->m_dirblksize);
+	INT_COPY(free->hdr.nvalid, ltp->bestcount, ARCH_CONVERT);
+	/*
+	 * Copy freespace entries from the leaf block to the new block.
+	 * Count active entries.
+	 */
+	for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, ARCH_CONVERT), to = free->bests;
+	     i < INT_GET(ltp->bestcount, ARCH_CONVERT); i++, from++, to++) {
+		if ((off = INT_GET(*from, ARCH_CONVERT)) != NULLDATAOFF)
+			n++;
+		INT_SET(*to, ARCH_CONVERT, off);
+	}
+	INT_SET(free->hdr.nused, ARCH_CONVERT, n);
+	INT_SET(leaf->hdr.info.magic, ARCH_CONVERT, XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Log everything.
+	 */
+	xfs_dir2_leaf_log_header(tp, lbp);
+	xfs_dir2_free_log_header(tp, fbp);
+	xfs_dir2_free_log_bests(tp, fbp, 0, INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1);
+	xfs_da_buf_done(fbp);
+	xfs_dir2_leafn_check(dp, lbp);
+	return 0;
+}
+
+/*
+ * Add a leaf entry to a leaf block in a node-form directory.
+ * The other work necessary is done from the caller.
+ */
+static int					/* error */
+xfs_dir2_leafn_add(
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			index)		/* insertion pt for new entry */
+{
+	int			compact;	/* compacting stale leaves */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			highstale;	/* next stale entry */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			lfloghigh;	/* high leaf entry logging */
+	int			lfloglow;	/* low leaf entry logging */
+	int			lowstale;	/* previous stale entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_sb("leafn_add", args, index, bp);
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	leaf = bp->data;
+	/*
+	 * If there are already the maximum number of leaf entries in
+	 * the block, if there are no stale entries it won't fit.
+	 * Caller will do a split.  If there are stale entries we'll do
+	 * a compact.
+	 */
+	if (INT_GET(leaf->hdr.count, ARCH_CONVERT) == XFS_DIR2_MAX_LEAF_ENTS(mp)) {
+		if (INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT))
+			return XFS_ERROR(ENOSPC);
+		compact = INT_GET(leaf->hdr.stale, ARCH_CONVERT) > 1;
+	} else
+		compact = 0;
+	ASSERT(index == 0 || INT_GET(leaf->ents[index - 1].hashval, ARCH_CONVERT) <= args->hashval);
+	ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
+	       INT_GET(leaf->ents[index].hashval, ARCH_CONVERT) >= args->hashval);
+
+	if (args->justcheck)
+		return 0;
+
+	/*
+	 * Compact out all but one stale leaf entry.  Leaves behind
+	 * the entry closest to index.
+	 */
+	if (compact) {
+		xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale,
+			&lfloglow, &lfloghigh);
+	}
+	/*
+	 * Set impossible logging indices for this case.
+	 */
+	else if (!INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT)) {
+		lfloglow = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		lfloghigh = -1;
+	}
+	/*
+	 * No stale entries, just insert a space for the new entry.
+	 */
+	if (INT_ISZERO(leaf->hdr.stale, ARCH_CONVERT)) {
+		lep = &leaf->ents[index];
+		if (index < INT_GET(leaf->hdr.count, ARCH_CONVERT))
+			ovbcopy(lep, lep + 1,
+				(INT_GET(leaf->hdr.count, ARCH_CONVERT) - index) * sizeof(*lep));
+		lfloglow = index;
+		lfloghigh = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		INT_MOD(leaf->hdr.count, ARCH_CONVERT, +1);
+	}
+	/*
+	 * There are stale entries.  We'll use one for the new entry.
+	 */
+	else {
+		/*
+		 * If we didn't do a compact then we need to figure out
+		 * which stale entry will be used.
+		 */
+		if (compact == 0) {
+			/*
+			 * Find first stale entry before our insertion point.
+			 */
+			for (lowstale = index - 1;
+			     lowstale >= 0 &&
+				INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) !=
+				XFS_DIR2_NULL_DATAPTR;
+			     lowstale--)
+				continue;
+			/*
+			 * Find next stale entry after insertion point.
+			 * Stop looking if the answer would be worse than
+			 * lowstale already found.
+			 */
+			for (highstale = index;
+			     highstale < INT_GET(leaf->hdr.count, ARCH_CONVERT) &&
+				INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) !=
+				XFS_DIR2_NULL_DATAPTR &&
+				(lowstale < 0 ||
+				 index - lowstale - 1 >= highstale - index);
+			     highstale++)
+				continue;
+		}
+		/*
+		 * Using the low stale entry.
+		 * Shift entries up toward the stale slot.
+		 */
+		if (lowstale >= 0 &&
+		    (highstale == INT_GET(leaf->hdr.count, ARCH_CONVERT) ||
+		     index - lowstale - 1 < highstale - index)) {
+			ASSERT(INT_GET(leaf->ents[lowstale].address, ARCH_CONVERT) ==
+			       XFS_DIR2_NULL_DATAPTR);
+			ASSERT(index - lowstale - 1 >= 0);
+			if (index - lowstale - 1 > 0)
+				ovbcopy(&leaf->ents[lowstale + 1],
+					&leaf->ents[lowstale],
+					(index - lowstale - 1) * sizeof(*lep));
+			lep = &leaf->ents[index - 1];
+			lfloglow = MIN(lowstale, lfloglow);
+			lfloghigh = MAX(index - 1, lfloghigh);
+		}
+		/*
+		 * Using the high stale entry.
+		 * Shift entries down toward the stale slot.
+		 */
+		else {
+			ASSERT(INT_GET(leaf->ents[highstale].address, ARCH_CONVERT) ==
+			       XFS_DIR2_NULL_DATAPTR);
+			ASSERT(highstale - index >= 0);
+			if (highstale - index > 0)
+				ovbcopy(&leaf->ents[index],
+					&leaf->ents[index + 1],
+					(highstale - index) * sizeof(*lep));
+			lep = &leaf->ents[index];
+			lfloglow = MIN(index, lfloglow);
+			lfloghigh = MAX(highstale, lfloghigh);
+		}
+		INT_MOD(leaf->hdr.stale, ARCH_CONVERT, -1);
+	}
+	/*
+	 * Insert the new entry, log everything.
+	 */
+	INT_SET(lep->hashval, ARCH_CONVERT, args->hashval);
+	INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_DB_OFF_TO_DATAPTR(mp, args->blkno, args->index));
+	xfs_dir2_leaf_log_header(tp, bp);
+	xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
+	xfs_dir2_leafn_check(dp, bp);
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check internal consistency of a leafn block.
+ */
+void
+xfs_dir2_leafn_check(
+	xfs_inode_t	*dp,			/* incore directory inode */
+	xfs_dabuf_t	*bp)			/* leaf buffer */
+{
+	int		i;			/* leaf index */
+	xfs_dir2_leaf_t *leaf;			/* leaf structure */
+	xfs_mount_t	*mp;			/* filesystem mount point */
+	int		stale;			/* count of stale leaves */
+
+	leaf = bp->data;
+	mp = dp->i_mount;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) <= XFS_DIR2_MAX_LEAF_ENTS(mp));
+	for (i = stale = 0; i < INT_GET(leaf->hdr.count, ARCH_CONVERT); i++) {
+		if (i + 1 < INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
+			ASSERT(INT_GET(leaf->ents[i].hashval, ARCH_CONVERT) <=
+			       INT_GET(leaf->ents[i + 1].hashval, ARCH_CONVERT));
+		}
+		if (INT_GET(leaf->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			stale++;
+	}
+	ASSERT(INT_GET(leaf->hdr.stale, ARCH_CONVERT) == stale);
+}
+#endif	/* DEBUG */
+
+/*
+ * Return the last hash value in the leaf.
+ * Stale entries are ok.
+ */
+xfs_dahash_t					/* hash value */
+xfs_dir2_leafn_lasthash(
+	xfs_dabuf_t	*bp,			/* leaf buffer */
+	int		*count)			/* count of entries in leaf */
+{
+	xfs_dir2_leaf_t *leaf;			/* leaf structure */
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	if (count)
+		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	if (INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+		return 0;
+	return INT_GET(leaf->ents[INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			*indexp,	/* out: leaf entry index */
+	xfs_da_state_t		*state)		/* state to fill in */
+{
+	xfs_dabuf_t		*curbp;		/* current data/free buffer */
+	xfs_dir2_db_t		curdb;		/* current data block number */
+	xfs_dir2_db_t		curfdb;		/* current free block number */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			fi;		/* free entry index */
+	xfs_dir2_free_t		*free=NULL;	/* free block structure */
+	int			index;		/* leaf entry index */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			length=0;	/* length of new data entry */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_db_t		newdb;		/* new data block number */
+	xfs_dir2_db_t		newfdb;		/* new free block number */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+#ifdef __KERNEL__
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) > 0);
+#endif
+	xfs_dir2_leafn_check(dp, bp);
+	/*
+	 * Look up the hash value in the leaf entries.
+	 */
+	index = xfs_dir2_leaf_search_hash(args, bp);
+	/*
+	 * Do we have a buffer coming in?
+	 */
+	if (state->extravalid)
+		curbp = state->extrablk.bp;
+	else
+		curbp = NULL;
+	/*
+	 * For addname, it's a free block buffer, get the block number.
+	 */
+	if (args->addname) {
+		curfdb = curbp ? state->extrablk.blkno : -1;
+		curdb = -1;
+		length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+		if ((free = (curbp ? curbp->data : NULL)))
+			ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	}
+	/*
+	 * For others, it's a data block buffer, get the block number.
+	 */
+	else {
+		curfdb = -1;
+		curdb = curbp ? state->extrablk.blkno : -1;
+	}
+	/*
+	 * Loop over leaf entries with the right hash value.
+	 */
+	for (lep = &leaf->ents[index];
+	     index < INT_GET(leaf->hdr.count, ARCH_CONVERT) && INT_GET(lep->hashval, ARCH_CONVERT) == args->hashval;
+	     lep++, index++) {
+		/*
+		 * Skip stale leaf entries.
+		 */
+		if (INT_GET(lep->address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Pull the data block number from the entry.
+		 */
+		newdb = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+		/*
+		 * For addname, we're looking for a place to put the new entry.
+		 * We want to use a data block with an entry of equal
+		 * hash value to ours if there is one with room.
+		 */
+		if (args->addname) {
+			/*
+			 * If this block isn't the data block we already have
+			 * in hand, take a look at it.
+			 */
+			if (newdb != curdb) {
+				curdb = newdb;
+				/*
+				 * Convert the data block to the free block
+				 * holding its freespace information.
+				 */
+				newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb);
+				/*
+				 * If it's not the one we have in hand,
+				 * read it in.
+				 */
+				if (newfdb != curfdb) {
+					/*
+					 * If we had one before, drop it.
+					 */
+					if (curbp)
+						xfs_da_brelse(tp, curbp);
+					/*
+					 * Read the free block.
+					 */
+					if ((error = xfs_da_read_buf(tp, dp,
+							XFS_DIR2_DB_TO_DA(mp,
+								newfdb),
+							-1, &curbp,
+							XFS_DATA_FORK))) {
+						return error;
+					}
+					curfdb = newfdb;
+					free = curbp->data;
+					ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) ==
+					       XFS_DIR2_FREE_MAGIC);
+					ASSERT((INT_GET(free->hdr.firstdb, ARCH_CONVERT) %
+						XFS_DIR2_MAX_FREE_BESTS(mp)) ==
+					       0);
+					ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) <= curdb);
+					ASSERT(curdb <
+					       INT_GET(free->hdr.firstdb, ARCH_CONVERT) +
+					       INT_GET(free->hdr.nvalid, ARCH_CONVERT));
+				}
+				/*
+				 * Get the index for our entry.
+				 */
+				fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb);
+				/*
+				 * If it has room, return it.
+				 */
+				if (INT_GET(free->bests[fi], ARCH_CONVERT) == NULLDATAOFF) {
+					return XFS_ERROR(EFSCORRUPTED);
+				}
+				if (INT_GET(free->bests[fi], ARCH_CONVERT) >= length) {
+					*indexp = index;
+					state->extravalid = 1;
+					state->extrablk.bp = curbp;
+					state->extrablk.blkno = curfdb;
+					state->extrablk.index = fi;
+					state->extrablk.magic =
+						XFS_DIR2_FREE_MAGIC;
+					ASSERT(args->oknoent);
+					return XFS_ERROR(ENOENT);
+				}
+			}
+		}
+		/*
+		 * Not adding a new entry, so we really want to find
+		 * the name given to us.
+		 */
+		else {
+			/*
+			 * If it's a different data block, go get it.
+			 */
+			if (newdb != curdb) {
+				/*
+				 * If we had a block before, drop it.
+				 */
+				if (curbp)
+					xfs_da_brelse(tp, curbp);
+				/*
+				 * Read the data block.
+				 */
+				if ((error =
+				    xfs_da_read_buf(tp, dp,
+					    XFS_DIR2_DB_TO_DA(mp, newdb), -1,
+					    &curbp, XFS_DATA_FORK))) {
+					return error;
+				}
+				xfs_dir2_data_check(dp, curbp);
+				curdb = newdb;
+			}
+			/*
+			 * Point to the data entry.
+			 */
+			dep = (xfs_dir2_data_entry_t *)
+			      ((char *)curbp->data +
+			       XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT)));
+			/*
+			 * Compare the entry, return it if it matches.
+			 */
+			if (dep->namelen == args->namelen &&
+			    dep->name[0] == args->name[0] &&
+			    bcmp(dep->name, args->name, args->namelen) == 0) {
+				args->inumber = INT_GET(dep->inumber, ARCH_CONVERT);
+				*indexp = index;
+				state->extravalid = 1;
+				state->extrablk.bp = curbp;
+				state->extrablk.blkno = curdb;
+				state->extrablk.index =
+					(int)((char *)dep -
+					      (char *)curbp->data);
+				state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+				return XFS_ERROR(EEXIST);
+			}
+		}
+	}
+	/*
+	 * Didn't find a match.
+	 * If we are holding a buffer, give it back in case our caller
+	 * finds it useful.
+	 */
+	if ((state->extravalid = (curbp != NULL))) {
+		state->extrablk.bp = curbp;
+		state->extrablk.index = -1;
+		/*
+		 * For addname, giving back a free block.
+		 */
+		if (args->addname) {
+			state->extrablk.blkno = curfdb;
+			state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+		}
+		/*
+		 * For other callers, giving back a data block.
+		 */
+		else {
+			state->extrablk.blkno = curdb;
+			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+		}
+	}
+	/*
+	 * Return the final index, that will be the insertion point.
+	 */
+	*indexp = index;
+	ASSERT(index == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Move count leaf entries from source to destination leaf.
+ * Log entries and headers.  Stale entries are preserved.
+ */
+static void
+xfs_dir2_leafn_moveents(
+	xfs_da_args_t	*args,			/* operation arguments */
+	xfs_dabuf_t	*bp_s,			/* source leaf buffer */
+	int		start_s,		/* source leaf index */
+	xfs_dabuf_t	*bp_d,			/* destination leaf buffer */
+	int		start_d,		/* destination leaf index */
+	int		count)			/* count of leaves to copy */
+{
+	xfs_dir2_leaf_t *leaf_d;		/* destination leaf structure */
+	xfs_dir2_leaf_t *leaf_s;		/* source leaf structure */
+	int		stale;			/* count stale leaves copied */
+	xfs_trans_t	*tp;			/* transaction pointer */
+
+	xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d,
+		start_d, count);
+	/*
+	 * Silently return if nothing to do.
+	 */
+	if (count == 0) {
+		return;
+	}
+	tp = args->trans;
+	leaf_s = bp_s->data;
+	leaf_d = bp_d->data;
+	/*
+	 * If the destination index is not the end of the current
+	 * destination leaf entries, open up a hole in the destination
+	 * to hold the new entries.
+	 */
+	if (start_d < INT_GET(leaf_d->hdr.count, ARCH_CONVERT)) {
+		ovbcopy(&leaf_d->ents[start_d], &leaf_d->ents[start_d + count],
+			(INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - start_d) *
+			sizeof(xfs_dir2_leaf_entry_t));
+		xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count,
+			count + INT_GET(leaf_d->hdr.count, ARCH_CONVERT) - 1);
+	}
+	/*
+	 * If the source has stale leaves, count the ones in the copy range
+	 * so we can update the header correctly.
+	 */
+	if (!INT_ISZERO(leaf_s->hdr.stale, ARCH_CONVERT)) {
+		int	i;			/* temp leaf index */
+
+		for (i = start_s, stale = 0; i < start_s + count; i++) {
+			if (INT_GET(leaf_s->ents[i].address, ARCH_CONVERT) == XFS_DIR2_NULL_DATAPTR)
+				stale++;
+		}
+	} else
+		stale = 0;
+	/*
+	 * Copy the leaf entries from source to destination.
+	 */
+	bcopy(&leaf_s->ents[start_s], &leaf_d->ents[start_d],
+		count * sizeof(xfs_dir2_leaf_entry_t));
+	xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+	/*
+	 * If there are source entries after the ones we copied,
+	 * delete the ones we copied by sliding the next ones down.
+	 */
+	if (start_s + count < INT_GET(leaf_s->hdr.count, ARCH_CONVERT)) {
+		ovbcopy(&leaf_s->ents[start_s + count], &leaf_s->ents[start_s],
+			count * sizeof(xfs_dir2_leaf_entry_t));
+		xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
+	}
+	/*
+	 * Update the headers and log them.
+	 */
+	INT_MOD(leaf_s->hdr.count, ARCH_CONVERT, -(count));
+	INT_MOD(leaf_s->hdr.stale, ARCH_CONVERT, -(stale));
+	INT_MOD(leaf_d->hdr.count, ARCH_CONVERT, count);
+	INT_MOD(leaf_d->hdr.stale, ARCH_CONVERT, stale);
+	xfs_dir2_leaf_log_header(tp, bp_s);
+	xfs_dir2_leaf_log_header(tp, bp_d);
+	xfs_dir2_leafn_check(args->dp, bp_s);
+	xfs_dir2_leafn_check(args->dp, bp_d);
+}
+
+/*
+ * Determine the sort order of two leaf blocks.
+ * Returns 1 if both are valid and leaf2 should be before leaf1, else 0.
+ */
+int						/* sort order */
+xfs_dir2_leafn_order(
+	xfs_dabuf_t	*leaf1_bp,		/* leaf1 buffer */
+	xfs_dabuf_t	*leaf2_bp)		/* leaf2 buffer */
+{
+	xfs_dir2_leaf_t *leaf1;			/* leaf1 structure */
+	xfs_dir2_leaf_t *leaf2;			/* leaf2 structure */
+
+	leaf1 = leaf1_bp->data;
+	leaf2 = leaf2_bp->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0 &&
+	    INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0 &&
+	    (INT_GET(leaf2->ents[0].hashval, ARCH_CONVERT) < INT_GET(leaf1->ents[0].hashval, ARCH_CONVERT) ||
+	     INT_GET(leaf2->ents[INT_GET(leaf2->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT) <
+	     INT_GET(leaf1->ents[INT_GET(leaf1->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT)))
+		return 1;
+	return 0;
+}
+
+/*
+ * Rebalance leaf entries between two leaf blocks.
+ * This is actually only called when the second block is new,
+ * though the code deals with the general case.
+ * A new entry will be inserted in one of the blocks, and that
+ * entry is taken into account when balancing.
+ */
+static void
+xfs_dir2_leafn_rebalance(
+	xfs_da_state_t		*state,		/* btree cursor */
+	xfs_da_state_blk_t	*blk1,		/* first btree block */
+	xfs_da_state_blk_t	*blk2)		/* second btree block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	int			count;		/* count (& direction) leaves */
+	int			isleft;		/* new goes in left leaf */
+	xfs_dir2_leaf_t		*leaf1;		/* first leaf structure */
+	xfs_dir2_leaf_t		*leaf2;		/* second leaf structure */
+	int			mid;		/* midpoint leaf index */
+#ifdef DEBUG
+	int			oldstale;	/* old count of stale leaves */
+#endif
+	int			oldsum;		/* old total leaf count */
+	int			swap;		/* swapped leaf blocks */
+
+	args = state->args;
+	/*
+	 * If the block order is wrong, swap the arguments.
+	 */
+	if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) {
+		xfs_da_state_blk_t	*tmp;	/* temp for block swap */
+
+		tmp = blk1;
+		blk1 = blk2;
+		blk2 = tmp;
+	}
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	oldsum = INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT);
+#ifdef DEBUG
+	oldstale = INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT);
+#endif
+	mid = oldsum >> 1;
+	/*
+	 * If the old leaf count was odd then the new one will be even,
+	 * so we need to divide the new count evenly.
+	 */
+	if (oldsum & 1) {
+		xfs_dahash_t	midhash;	/* middle entry hash value */
+
+		if (mid >= INT_GET(leaf1->hdr.count, ARCH_CONVERT))
+			midhash = INT_GET(leaf2->ents[mid - INT_GET(leaf1->hdr.count, ARCH_CONVERT)].hashval, ARCH_CONVERT);
+		else
+			midhash = INT_GET(leaf1->ents[mid].hashval, ARCH_CONVERT);
+		isleft = args->hashval <= midhash;
+	}
+	/*
+	 * If the old count is even then the new count is odd, so there's
+	 * no preferred side for the new entry.
+	 * Pick the left one.
+	 */
+	else
+		isleft = 1;
+	/*
+	 * Calculate moved entry count.	 Positive means left-to-right,
+	 * negative means right-to-left.  Then move the entries.
+	 */
+	count = INT_GET(leaf1->hdr.count, ARCH_CONVERT) - mid + (isleft == 0);
+	if (count > 0)
+		xfs_dir2_leafn_moveents(args, blk1->bp,
+			INT_GET(leaf1->hdr.count, ARCH_CONVERT) - count, blk2->bp, 0, count);
+	else if (count < 0)
+		xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp,
+			INT_GET(leaf1->hdr.count, ARCH_CONVERT), count);
+	ASSERT(INT_GET(leaf1->hdr.count, ARCH_CONVERT) + INT_GET(leaf2->hdr.count, ARCH_CONVERT) == oldsum);
+	ASSERT(INT_GET(leaf1->hdr.stale, ARCH_CONVERT) + INT_GET(leaf2->hdr.stale, ARCH_CONVERT) == oldstale);
+	/*
+	 * Mark whether we're inserting into the old or new leaf.
+	 */
+	if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) < INT_GET(leaf2->hdr.count, ARCH_CONVERT))
+		state->inleaf = swap;
+	else if (INT_GET(leaf1->hdr.count, ARCH_CONVERT) > INT_GET(leaf2->hdr.count, ARCH_CONVERT))
+		state->inleaf = !swap;
+	else
+		state->inleaf =
+			swap ^ (args->hashval < INT_GET(leaf2->ents[0].hashval, ARCH_CONVERT));
+	/*
+	 * Adjust the expected index for insertion.
+	 */
+	if (!state->inleaf)
+		blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+}
+
+/*
+ * Remove an entry from a node directory.
+ * This removes the leaf entry and the data entry,
+ * and updates the free block if necessary.
+ */
+static int					/* error */
+xfs_dir2_leafn_remove(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*bp,		/* leaf buffer */
+	int			index,		/* leaf entry index */
+	xfs_da_state_blk_t	*dblk,		/* data block */
+	int			*rval)		/* resulting block needs join */
+{
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_db_t		db;		/* data block number */
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data block entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry */
+	int			longest;	/* longest data free entry */
+	int			off;		/* data block entry offset */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	xfs_dir2_trace_args_sb("leafn_remove", args, index, bp);
+	dp = args->dp;
+	tp = args->trans;
+	mp = dp->i_mount;
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Point to the entry we're removing.
+	 */
+	lep = &leaf->ents[index];
+	/*
+	 * Extract the data block and offset from the entry.
+	 */
+	db = XFS_DIR2_DATAPTR_TO_DB(mp, INT_GET(lep->address, ARCH_CONVERT));
+	ASSERT(dblk->blkno == db);
+	off = XFS_DIR2_DATAPTR_TO_OFF(mp, INT_GET(lep->address, ARCH_CONVERT));
+	ASSERT(dblk->index == off);
+	/*
+	 * Kill the leaf entry by marking it stale.
+	 * Log the leaf block changes.
+	 */
+	INT_MOD(leaf->hdr.stale, ARCH_CONVERT, +1);
+	xfs_dir2_leaf_log_header(tp, bp);
+	INT_SET(lep->address, ARCH_CONVERT, XFS_DIR2_NULL_DATAPTR);
+	xfs_dir2_leaf_log_ents(tp, bp, index, index);
+	/*
+	 * Make the data entry free.  Keep track of the longest freespace
+	 * in the data block in case it changes.
+	 */
+	dbp = dblk->bp;
+	data = dbp->data;
+	dep = (xfs_dir2_data_entry_t *)((char *)data + off);
+	longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
+	needlog = needscan = 0;
+	xfs_dir2_data_make_free(tp, dbp, off,
+		XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan);
+	/*
+	 * Rescan the data block freespaces for bestfree.
+	 * Log the data block header if needed.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	xfs_dir2_data_check(dp, dbp);
+	/*
+	 * If the longest data block freespace changes, need to update
+	 * the corresponding freeblock entry.
+	 */
+	if (longest < INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
+		int		error;		/* error return value */
+		xfs_dabuf_t	*fbp;		/* freeblock buffer */
+		xfs_dir2_db_t	fdb;		/* freeblock block number */
+		int		findex;		/* index in freeblock entries */
+		xfs_dir2_free_t *free;		/* freeblock structure */
+		int		logfree;	/* need to log free entry */
+
+		/*
+		 * Convert the data block number to a free block,
+		 * read in the free block.
+		 */
+		fdb = XFS_DIR2_DB_TO_FDB(mp, db);
+		if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb),
+				-1, &fbp, XFS_DATA_FORK))) {
+			return error;
+		}
+		free = fbp->data;
+		ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+		ASSERT(INT_GET(free->hdr.firstdb, ARCH_CONVERT) ==
+		       XFS_DIR2_MAX_FREE_BESTS(mp) *
+		       (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+		/*
+		 * Calculate which entry we need to fix.
+		 */
+		findex = XFS_DIR2_DB_TO_FDINDEX(mp, db);
+		longest = INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT);
+		/*
+		 * If the data block is now empty we can get rid of it
+		 * (usually).
+		 */
+		if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) {
+			/*
+			 * Try to punch out the data block.
+			 */
+			error = xfs_dir2_shrink_inode(args, db, dbp);
+			if (error == 0) {
+				dblk->bp = NULL;
+				data = NULL;
+			}
+			/*
+			 * We can get ENOSPC if there's no space reservation.
+			 * In this case just drop the buffer and some one else
+			 * will eventually get rid of the empty block.
+			 */
+			else if (error == ENOSPC && args->total == 0)
+				xfs_da_buf_done(dbp);
+			else
+				return error;
+		}
+		/*
+		 * If we got rid of the data block, we can eliminate that entry
+		 * in the free block.
+		 */
+		if (data == NULL) {
+			/*
+			 * One less used entry in the free table.
+			 */
+			INT_MOD(free->hdr.nused, ARCH_CONVERT, -1);
+			xfs_dir2_free_log_header(tp, fbp);
+			/*
+			 * If this was the last entry in the table, we can
+			 * trim the table size back.  There might be other
+			 * entries at the end referring to non-existent
+			 * data blocks, get those too.
+			 */
+			if (findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT) - 1) {
+				int	i;		/* free entry index */
+
+				for (i = findex - 1;
+				     i >= 0 && INT_GET(free->bests[i], ARCH_CONVERT) == NULLDATAOFF;
+				     i--)
+					continue;
+				INT_SET(free->hdr.nvalid, ARCH_CONVERT, i + 1);
+				logfree = 0;
+			}
+			/*
+			 * Not the last entry, just punch it out.
+			 */
+			else {
+				INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
+				logfree = 1;
+			}
+			/*
+			 * If there are no useful entries left in the block,
+			 * get rid of the block if we can.
+			 */
+			if (INT_ISZERO(free->hdr.nused, ARCH_CONVERT)) {
+				error = xfs_dir2_shrink_inode(args, fdb, fbp);
+				if (error == 0) {
+					fbp = NULL;
+					logfree = 0;
+				} else if (error != ENOSPC || args->total != 0)
+					return error;
+				/*
+				 * It's possible to get ENOSPC if there is no
+				 * space reservation.  In this case some one
+				 * else will eventually get rid of this block.
+				 */
+			}
+		}
+		/*
+		 * Data block is not empty, just set the free entry to
+		 * the new value.
+		 */
+		else {
+			INT_SET(free->bests[findex], ARCH_CONVERT, longest);
+			logfree = 1;
+		}
+		/*
+		 * Log the free entry that changed, unless we got rid of it.
+		 */
+		if (logfree)
+			xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+		/*
+		 * Drop the buffer if we still have it.
+		 */
+		if (fbp)
+			xfs_da_buf_done(fbp);
+	}
+	xfs_dir2_leafn_check(dp, bp);
+	/*
+	 * Return indication of whether this leaf block is emtpy enough
+	 * to justify trying to join it with a neighbor.
+	 */
+	*rval =
+		((uint)sizeof(leaf->hdr) +
+		 (uint)sizeof(leaf->ents[0]) *
+		 (INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT))) <
+		mp->m_dir_magicpct;
+	return 0;
+}
+
+/*
+ * Split the leaf entries in the old block into old and new blocks.
+ */
+int						/* error */
+xfs_dir2_leafn_split(
+	xfs_da_state_t		*state,		/* btree cursor */
+	xfs_da_state_blk_t	*oldblk,	/* original block */
+	xfs_da_state_blk_t	*newblk)	/* newly created block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_dablk_t		blkno;		/* new leaf block number */
+	int			error;		/* error return value */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	args = state->args;
+	mp = args->dp->i_mount;
+	ASSERT(args != NULL);
+	ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Initialize the new leaf block.
+	 */
+	error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno),
+		&newblk->bp, XFS_DIR2_LEAFN_MAGIC);
+	if (error) {
+		return error;
+	}
+	newblk->blkno = blkno;
+	newblk->magic = XFS_DIR2_LEAFN_MAGIC;
+	/*
+	 * Rebalance the entries across the two leaves, link the new
+	 * block into the leaves.
+	 */
+	xfs_dir2_leafn_rebalance(state, oldblk, newblk);
+	error = xfs_da_blk_link(state, oldblk, newblk);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Insert the new entry in the correct block.
+	 */
+	if (state->inleaf)
+		error = xfs_dir2_leafn_add(oldblk->bp, args, oldblk->index);
+	else
+		error = xfs_dir2_leafn_add(newblk->bp, args, newblk->index);
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
+	xfs_dir2_leafn_check(args->dp, oldblk->bp);
+	xfs_dir2_leafn_check(args->dp, newblk->bp);
+	return error;
+}
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int						/* error */
+xfs_dir2_leafn_toosmall(
+	xfs_da_state_t		*state,		/* btree cursor */
+	int			*action)	/* resulting action to take */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	xfs_dablk_t		blkno;		/* leaf block number */
+	xfs_dabuf_t		*bp;		/* leaf buffer */
+	int			bytes;		/* bytes in use */
+	int			count;		/* leaf live entry count */
+	int			error;		/* error return value */
+	int			forward;	/* sibling block direction */
+	int			i;		/* sibling counter */
+	xfs_da_blkinfo_t	*info;		/* leaf block header */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	int			rval;		/* result from path_shift */
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[state->path.active - 1];
+	info = blk->bp->data;
+	ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	leaf = (xfs_dir2_leaf_t *)info;
+	count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
+	bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]);
+	if (bytes > (state->blocksize >> 1)) {
+		/*
+		 * Blk over 50%, don't try to join.
+		 */
+		*action = 0;
+		return 0;
+	}
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (arbitrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = !INT_ISZERO(info->forw, ARCH_CONVERT);
+		bcopy(&state->path, &state->altpath, sizeof(state->path));
+		error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+			&rval);
+		if (error)
+			return error;
+		*action = rval ? 2 : 0;
+		return 0;
+	}
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	forward = INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT);
+	for (i = 0, bp = NULL; i < 2; forward = !forward, i++) {
+		blkno = forward ?INT_GET( info->forw, ARCH_CONVERT) : INT_GET(info->back, ARCH_CONVERT);
+		if (blkno == 0)
+			continue;
+		/*
+		 * Read the sibling leaf block.
+		 */
+		if ((error =
+		    xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
+			    -1, &bp, XFS_DATA_FORK))) {
+			return error;
+		}
+		ASSERT(bp != NULL);
+		/*
+		 * Count bytes in the two blocks combined.
+		 */
+		leaf = (xfs_dir2_leaf_t *)info;
+		count = INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
+		bytes = state->blocksize - (state->blocksize >> 2);
+		leaf = bp->data;
+		ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+		count += INT_GET(leaf->hdr.count, ARCH_CONVERT) - INT_GET(leaf->hdr.stale, ARCH_CONVERT);
+		bytes -= count * (uint)sizeof(leaf->ents[0]);
+		/*
+		 * Fits with at least 25% to spare.
+		 */
+		if (bytes >= 0)
+			break;
+		xfs_da_brelse(state->args->trans, bp);
+	}
+	/*
+	 * Didn't like either block, give up.
+	 */
+	if (i >= 2) {
+		*action = 0;
+		return 0;
+	}
+	/*
+	 * Done with the sibling leaf block here, drop the dabuf
+	 * so path_shift can get it.
+	 */
+	xfs_da_buf_done(bp);
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	bcopy(&state->path, &state->altpath, sizeof(state->path));
+	if (blkno < blk->blkno)
+		error = xfs_da_path_shift(state, &state->altpath, forward, 0,
+			&rval);
+	else
+		error = xfs_da_path_shift(state, &state->path, forward, 0,
+			&rval);
+	if (error) {
+		return error;
+	}
+	*action = rval ? 0 : 1;
+	return 0;
+}
+
+/*
+ * Move all the leaf entries from drop_blk to save_blk.
+ * This is done as part of a join operation.
+ */
+void
+xfs_dir2_leafn_unbalance(
+	xfs_da_state_t		*state,		/* cursor */
+	xfs_da_state_blk_t	*drop_blk,	/* dead block */
+	xfs_da_state_blk_t	*save_blk)	/* surviving block */
+{
+	xfs_da_args_t		*args;		/* operation arguments */
+	xfs_dir2_leaf_t		*drop_leaf;	/* dead leaf structure */
+	xfs_dir2_leaf_t		*save_leaf;	/* surviving leaf structure */
+
+	args = state->args;
+	ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	drop_leaf = drop_blk->bp->data;
+	save_leaf = save_blk->bp->data;
+	ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * If there are any stale leaf entries, take this opportunity
+	 * to purge them.
+	 */
+	if (INT_GET(drop_leaf->hdr.stale, ARCH_CONVERT))
+		xfs_dir2_leaf_compact(args, drop_blk->bp);
+	if (INT_GET(save_leaf->hdr.stale, ARCH_CONVERT))
+		xfs_dir2_leaf_compact(args, save_blk->bp);
+	/*
+	 * Move the entries from drop to the appropriate end of save.
+	 */
+	drop_blk->hashval = INT_GET(drop_leaf->ents[INT_GET(drop_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
+		xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0,
+			INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
+	else
+		xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp,
+			INT_GET(save_leaf->hdr.count, ARCH_CONVERT), INT_GET(drop_leaf->hdr.count, ARCH_CONVERT));
+	save_blk->hashval = INT_GET(save_leaf->ents[INT_GET(save_leaf->hdr.count, ARCH_CONVERT) - 1].hashval, ARCH_CONVERT);
+	xfs_dir2_leafn_check(args->dp, save_blk->bp);
+}
+
+/*
+ * Top-level node form directory addname routine.
+ */
+int						/* error */
+xfs_dir2_node_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block for insert */
+	int			error;		/* error return value */
+	int			rval;		/* sub-return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	xfs_dir2_trace_args("node_addname", args);
+	/*
+	 * Allocate and initialize the state (btree cursor).
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_dirblksize;
+	/*
+	 * Look up the name.  We're not supposed to find it, but
+	 * this gives us the insertion point.
+	 */
+	error = xfs_da_node_lookup_int(state, &rval);
+	if (error)
+		rval = error;
+	if (rval != ENOENT) {
+		goto done;
+	}
+	/*
+	 * Add the data entry to a data block.
+	 * Extravalid is set to a freeblock found by lookup.
+	 */
+	rval = xfs_dir2_node_addname_int(args,
+		state->extravalid ? &state->extrablk : NULL);
+	if (rval) {
+		goto done;
+	}
+	blk = &state->path.blk[state->path.active - 1];
+	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	/*
+	 * Add the new leaf entry.
+	 */
+	rval = xfs_dir2_leafn_add(blk->bp, args, blk->index);
+	if (rval == 0) {
+		/*
+		 * It worked, fix the hash values up the btree.
+		 */
+		if (!args->justcheck)
+			xfs_da_fixhashpath(state, &state->path);
+	} else {
+		/*
+		 * It didn't work, we need to split the leaf block.
+		 */
+		if (args->total == 0) {
+			ASSERT(rval == ENOSPC);
+			goto done;
+		}
+		/*
+		 * Split the leaf block and insert the new entry.
+		 */
+		rval = xfs_da_split(state);
+	}
+done:
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Add the data entry for a node-format directory name addition.
+ * The leaf entry is added in xfs_dir2_leafn_add.
+ * We may enter with a freespace block that the lookup found.
+ */
+static int					/* error */
+xfs_dir2_node_addname_int(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_da_state_blk_t	*fblk)		/* optional freespace block */
+{
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_db_t		dbno;		/* data block number */
+	xfs_dabuf_t		*dbp;		/* data block buffer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* data unused entry pointer */
+	int			error;		/* error return value */
+	xfs_dir2_db_t		fbno;		/* freespace block number */
+	xfs_dabuf_t		*fbp;		/* freespace buffer */
+	int			findex;		/* freespace entry index */
+	xfs_dir2_db_t		foundbno=0;	/* found freespace block no */
+	int			foundindex=0;	/* found freespace entry idx */
+	xfs_dir2_free_t		*free=NULL;	/* freespace block structure */
+	xfs_dir2_db_t		ifbno;		/* initial freespace block no */
+	xfs_dir2_db_t		lastfbno=0;	/* highest freespace block no */
+	int			length;		/* length of the new entry */
+	int			logfree;	/* need to log free entry */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	int			needlog;	/* need to log data header */
+	int			needscan;	/* need to rescan data frees */
+	xfs_dir2_data_off_t	*tagp;		/* data entry tag pointer */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	length = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+	/*
+	 * If we came in with a freespace block that means that lookup
+	 * found an entry with our hash value.	This is the freespace
+	 * block for that data entry.
+	 */
+	if (fblk) {
+		fbp = fblk->bp;
+		/*
+		 * Remember initial freespace block number.
+		 */
+		ifbno = fblk->blkno;
+		free = fbp->data;
+		ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+		findex = fblk->index;
+		/*
+		 * This means the free entry showed that the data block had
+		 * space for our entry, so we remembered it.
+		 * Use that data block.
+		 */
+		if (findex >= 0) {
+			ASSERT(findex < INT_GET(free->hdr.nvalid, ARCH_CONVERT));
+			ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF);
+			ASSERT(INT_GET(free->bests[findex], ARCH_CONVERT) >= length);
+			dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
+		}
+		/*
+		 * The data block looked at didn't have enough room.
+		 * We'll start at the beginning of the freespace entries.
+		 */
+		else {
+			dbno = -1;
+			findex = 0;
+		}
+	}
+	/*
+	 * Didn't come in with a freespace block, so don't have a data block.
+	 */
+	else {
+		ifbno = dbno = -1;
+		fbp = NULL;
+		findex = 0;
+	}
+	/*
+	 * If we don't have a data block yet, we're going to scan the
+	 * freespace blocks looking for one.  Figure out what the
+	 * highest freespace block number is.
+	 */
+	if (dbno == -1) {
+		xfs_fileoff_t	fo;		/* freespace block number */
+
+		if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
+			return error;
+		lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo);
+		fbno = ifbno;
+		foundindex = -1;
+	}
+	/*
+	 * While we haven't identified a data block, search the freeblock
+	 * data for a good data block.	If we find a null freeblock entry,
+	 * indicating a hole in the data blocks, remember that.
+	 */
+	while (dbno == -1) {
+		/*
+		 * If we don't have a freeblock in hand, get the next one.
+		 */
+		if (fbp == NULL) {
+			/*
+			 * Happens the first time through unless lookup gave
+			 * us a freespace block to start with.
+			 */
+			if (++fbno == 0)
+				fbno = XFS_DIR2_FREE_FIRSTDB(mp);
+			/*
+			 * If it's ifbno we already looked at it.
+			 */
+			if (fbno == ifbno)
+				fbno++;
+			/*
+			 * If it's off the end we're done.
+			 */
+			if (fbno >= lastfbno)
+				break;
+			/*
+			 * Read the block.  There can be holes in the
+			 * freespace blocks, so this might not succeed.
+			 * This should be really rare, so there's no reason
+			 * to avoid it.
+			 */
+			if ((error = xfs_da_read_buf(tp, dp,
+					XFS_DIR2_DB_TO_DA(mp, fbno), -1, &fbp,
+					XFS_DATA_FORK))) {
+				return error;
+			}
+			if (fbp == NULL) {
+				continue;
+			}
+			free = fbp->data;
+			ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+			findex = 0;
+		}
+		/*
+		 * Look at the current free entry.  Is it good enough?
+		 */
+		if (INT_GET(free->bests[findex], ARCH_CONVERT) != NULLDATAOFF &&
+		    INT_GET(free->bests[findex], ARCH_CONVERT) >= length)
+			dbno = INT_GET(free->hdr.firstdb, ARCH_CONVERT) + findex;
+		else {
+			/*
+			 * If we haven't found an empty entry yet, and this
+			 * one is empty, remember this slot.
+			 */
+			if (foundindex == -1 &&
+			    INT_GET(free->bests[findex], ARCH_CONVERT) == NULLDATAOFF) {
+				foundindex = findex;
+				foundbno = fbno;
+			}
+			/*
+			 * Are we done with the freeblock?
+			 */
+			if (++findex == INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
+				/*
+				 * If there is space left in this freeblock,
+				 * and we don't have an empty entry yet,
+				 * remember this slot.
+				 */
+				if (foundindex == -1 &&
+				    findex < XFS_DIR2_MAX_FREE_BESTS(mp)) {
+					foundindex = findex;
+					foundbno = fbno;
+				}
+				/*
+				 * Drop the block.
+				 */
+				xfs_da_brelse(tp, fbp);
+				fbp = NULL;
+				if (fblk && fblk->bp)
+					fblk->bp = NULL;
+			}
+		}
+	}
+	/*
+	 * If we don't have a data block, and there's no free slot in a
+	 * freeblock, we need to add a new freeblock.
+	 */
+	if (dbno == -1 && foundindex == -1) {
+		/*
+		 * Not allowed to allocate, so return failure.
+		 */
+		if (args->justcheck || args->total == 0) {
+			return XFS_ERROR(ENOSPC);
+		}
+		/*
+		 * Add the new freeblock.
+		 */
+		if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE,
+				&fbno))) {
+			return error;
+		}
+		/*
+		 * Get a buffer for the new block.
+		 */
+		if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fbno),
+				-1, &fbp, XFS_DATA_FORK))) {
+			return error;
+		}
+		ASSERT(fbp != NULL);
+		/*
+		 * Initialize the new block to be empty, and remember
+		 * its first slot as our empty slot.
+		 */
+		free = fbp->data;
+		INT_SET(free->hdr.magic, ARCH_CONVERT, XFS_DIR2_FREE_MAGIC);
+		INT_SET(free->hdr.firstdb, ARCH_CONVERT, (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
+			XFS_DIR2_MAX_FREE_BESTS(mp));
+		INT_ZERO(free->hdr.nused, ARCH_CONVERT);
+		INT_ZERO(free->hdr.nvalid, ARCH_CONVERT);
+		foundindex = 0;
+		foundbno = fbno;
+	}
+	/*
+	 * If we don't have a data block, and we don't have a freeblock buffer
+	 * in hand (we dropped the one with the free slot in it),
+	 * go read the freeblock again.
+	 */
+	if (dbno == -1 && fbp == NULL) {
+		/*
+		 * We're going to use the empty slot we found before.
+		 */
+		findex = foundindex;
+		fbno = foundbno;
+		if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fbno),
+				-1, &fbp, XFS_DATA_FORK))) {
+			return error;
+		}
+		ASSERT(fbp != NULL);
+		free = fbp->data;
+		ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	}
+	/*
+	 * If we don't have a data block, we need to allocate one and make
+	 * the freespace entries refer to it.
+	 */
+	if (dbno == -1) {
+		/*
+		 * Not allowed to allocate, return failure.
+		 */
+		if (args->justcheck || args->total == 0) {
+			/*
+			 * Drop the freespace buffer unless it came from our
+			 * caller.
+			 */
+			if (fblk == NULL || fblk->bp == NULL)
+				xfs_da_buf_done(fbp);
+			return XFS_ERROR(ENOSPC);
+		}
+		/*
+		 * Allocate and initialize the new data block.
+		 */
+		if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE,
+				&dbno)) ||
+		    (error = xfs_dir2_data_init(args, dbno, &dbp))) {
+			/*
+			 * Drop the freespace buffer unless it came from our
+			 * caller.
+			 */
+			if (fblk == NULL || fblk->bp == NULL)
+				xfs_da_buf_done(fbp);
+			return error;
+		}
+		/*
+		 * If the freespace entry for this data block is not in the
+		 * freespace block we have in hand, drop the one we have
+		 * and get the right one.
+		 */
+		if (XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno) {
+			xfs_da_brelse(tp, fbp);
+			if (fblk && fblk->bp)
+				fblk->bp = NULL;
+			fbno = XFS_DIR2_DB_TO_FDB(mp, dbno);
+			if ((error = xfs_da_read_buf(tp, dp,
+					XFS_DIR2_DB_TO_DA(mp, fbno), -1, &fbp,
+					XFS_DATA_FORK))) {
+				xfs_da_buf_done(dbp);
+				return error;
+			}
+			ASSERT(fbp != NULL);
+			free = fbp->data;
+			ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+		}
+		/*
+		 * Set the freespace block index from the data block number.
+		 */
+		findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno);
+		/*
+		 * If it's after the end of the current entries in the
+		 * freespace block, extend that table.
+		 */
+		if (findex >= INT_GET(free->hdr.nvalid, ARCH_CONVERT)) {
+			ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp));
+			INT_SET(free->hdr.nvalid, ARCH_CONVERT, findex + 1);
+			/*
+			 * Tag new entry so nused will go up.
+			 */
+			INT_SET(free->bests[findex], ARCH_CONVERT, NULLDATAOFF);
+		}
+		/*
+		 * If this entry was for an empty data block
+		 * (this should always be true) then update the header.
+		 */
+		if (INT_GET(free->bests[findex], ARCH_CONVERT) == NULLDATAOFF) {
+			INT_MOD(free->hdr.nused, ARCH_CONVERT, +1);
+			xfs_dir2_free_log_header(tp, fbp);
+		}
+		/*
+		 * Update the real value in the table.
+		 * We haven't allocated the data entry yet so this will
+		 * change again.
+		 */
+		data = dbp->data;
+		INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		logfree = 1;
+	}
+	/*
+	 * We had a data block so we don't have to make a new one.
+	 */
+	else {
+		/*
+		 * If just checking, we succeeded.
+		 */
+		if (args->justcheck) {
+			if (fblk == NULL || fblk->bp == NULL)
+				xfs_da_buf_done(fbp);
+			return 0;
+		}
+		/*
+		 * Read the data block in.
+		 */
+		if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno),
+				-1, &dbp, XFS_DATA_FORK))) {
+			if (fblk == NULL || fblk->bp == NULL)
+				xfs_da_buf_done(fbp);
+			return error;
+		}
+		data = dbp->data;
+		logfree = 0;
+	}
+	ASSERT(INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT) >= length);
+	/*
+	 * Point to the existing unused space.
+	 */
+	dup = (xfs_dir2_data_unused_t *)
+	      ((char *)data + INT_GET(data->hdr.bestfree[0].offset, ARCH_CONVERT));
+	needscan = needlog = 0;
+	/*
+	 * Mark the first part of the unused space, inuse for us.
+	 */
+	xfs_dir2_data_use_free(tp, dbp, dup,
+		(xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length,
+		&needlog, &needscan);
+	/*
+	 * Fill in the new entry and log it.
+	 */
+	dep = (xfs_dir2_data_entry_t *)dup;
+	INT_SET(dep->inumber, ARCH_CONVERT, args->inumber);
+	dep->namelen = args->namelen;
+	bcopy(args->name, dep->name, dep->namelen);
+	tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+	INT_SET(*tagp, ARCH_CONVERT, (xfs_dir2_data_off_t)((char *)dep - (char *)data));
+	xfs_dir2_data_log_entry(tp, dbp, dep);
+	/*
+	 * Rescan the block for bestfree if needed.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(mp, data, &needlog, NULL);
+	/*
+	 * Log the data block header if needed.
+	 */
+	if (needlog)
+		xfs_dir2_data_log_header(tp, dbp);
+	/*
+	 * If the freespace entry is now wrong, update it.
+	 */
+	if (INT_GET(free->bests[findex], ARCH_CONVERT) != INT_GET(data->hdr.bestfree[0].length, ARCH_CONVERT)) {
+		INT_COPY(free->bests[findex], data->hdr.bestfree[0].length, ARCH_CONVERT);
+		logfree = 1;
+	}
+	/*
+	 * Log the freespace entry if needed.
+	 */
+	if (logfree)
+		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+	/*
+	 * If the caller didn't hand us the freespace block, drop it.
+	 */
+	if (fblk == NULL || fblk->bp == NULL)
+		xfs_da_buf_done(fbp);
+	/*
+	 * Return the data block and offset in args, then drop the data block.
+	 */
+	args->blkno = (xfs_dablk_t)dbno;
+	args->index = INT_GET(*tagp, ARCH_CONVERT);
+	xfs_da_buf_done(dbp);
+	return 0;
+}
+
+/*
+ * Lookup an entry in a node-format directory.
+ * All the real work happens in xfs_da_node_lookup_int.
+ * The only real output is the inode number of the entry.
+ */
+int						/* error */
+xfs_dir2_node_lookup(
+	xfs_da_args_t	*args)			/* operation arguments */
+{
+	int		error;			/* error return value */
+	int		i;			/* btree level */
+	int		rval;			/* operation return value */
+	xfs_da_state_t	*state;			/* btree cursor */
+
+	xfs_dir2_trace_args("node_lookup", args);
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_dirblksize;
+	/*
+	 * Fill in the path to the entry in the cursor.
+	 */
+	error = xfs_da_node_lookup_int(state, &rval);
+	if (error)
+		rval = error;
+	/*
+	 * Release the btree blocks and leaf block.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	/*
+	 * Release the data block if we have it.
+	 */
+	if (state->extravalid && state->extrablk.bp) {
+		xfs_da_brelse(args->trans, state->extrablk.bp);
+		state->extrablk.bp = NULL;
+	}
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Remove an entry from a node-format directory.
+ */
+int						/* error */
+xfs_dir2_node_removename(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	int			error;		/* error return value */
+	int			rval;		/* operation return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	xfs_dir2_trace_args("node_removename", args);
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_dirblksize;
+	/*
+	 * Look up the entry we're deleting, set up the cursor.
+	 */
+	error = xfs_da_node_lookup_int(state, &rval);
+	if (error) {
+		rval = error;
+	}
+	/*
+	 * Didn't find it, upper layer screwed up.
+	 */
+	if (rval != EEXIST) {
+		xfs_da_state_free(state);
+		return rval;
+	}
+	blk = &state->path.blk[state->path.active - 1];
+	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+	ASSERT(state->extravalid);
+	/*
+	 * Remove the leaf and data entries.
+	 * Extrablk refers to the data block.
+	 */
+	error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
+		&state->extrablk, &rval);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Fix the hash values up the btree.
+	 */
+	xfs_da_fixhashpath(state, &state->path);
+	/*
+	 * If we need to join leaf blocks, do it.
+	 */
+	if (rval && state->path.active > 1)
+		error = xfs_da_join(state);
+	/*
+	 * If no errors so far, try conversion to leaf format.
+	 */
+	if (!error)
+		error = xfs_dir2_node_to_leaf(state);
+	xfs_da_state_free(state);
+	return error;
+}
+
+/*
+ * Replace an entry's inode number in a node-format directory.
+ */
+int						/* error */
+xfs_dir2_node_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_da_state_blk_t	*blk;		/* leaf block */
+	xfs_dir2_data_t		*data;		/* data block structure */
+	xfs_dir2_data_entry_t	*dep;		/* data entry changed */
+	int			error;		/* error return value */
+	int			i;		/* btree level */
+	xfs_ino_t		inum;		/* new inode number */
+	xfs_dir2_leaf_t		*leaf;		/* leaf structure */
+	xfs_dir2_leaf_entry_t	*lep;		/* leaf entry being changed */
+	int			rval;		/* internal return value */
+	xfs_da_state_t		*state;		/* btree cursor */
+
+	xfs_dir2_trace_args("node_replace", args);
+	/*
+	 * Allocate and initialize the btree cursor.
+	 */
+	state = xfs_da_state_alloc();
+	state->args = args;
+	state->mp = args->dp->i_mount;
+	state->blocksize = state->mp->m_dirblksize;
+	inum = args->inumber;
+	/*
+	 * Lookup the entry to change in the btree.
+	 */
+	error = xfs_da_node_lookup_int(state, &rval);
+	if (error) {
+		rval = error;
+	}
+	/*
+	 * It should be found, since the vnodeops layer has looked it up
+	 * and locked it.  But paranoia is good.
+	 */
+	if (rval == EEXIST) {
+		/*
+		 * Find the leaf entry.
+		 */
+		blk = &state->path.blk[state->path.active - 1];
+		ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
+		leaf = blk->bp->data;
+		lep = &leaf->ents[blk->index];
+		ASSERT(state->extravalid);
+		/*
+		 * Point to the data entry.
+		 */
+		data = state->extrablk.bp->data;
+		ASSERT(INT_GET(data->hdr.magic, ARCH_CONVERT) == XFS_DIR2_DATA_MAGIC);
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)data +
+		       XFS_DIR2_DATAPTR_TO_OFF(state->mp, INT_GET(lep->address, ARCH_CONVERT)));
+		ASSERT(inum != INT_GET(dep->inumber, ARCH_CONVERT));
+		/*
+		 * Fill in the new inode number and log the entry.
+		 */
+		INT_SET(dep->inumber, ARCH_CONVERT, inum);
+		xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
+		rval = 0;
+	}
+	/*
+	 * Didn't find it, and we're holding a data block.  Drop it.
+	 */
+	else if (state->extravalid) {
+		xfs_da_brelse(args->trans, state->extrablk.bp);
+		state->extrablk.bp = NULL;
+	}
+	/*
+	 * Release all the buffers in the cursor.
+	 */
+	for (i = 0; i < state->path.active; i++) {
+		xfs_da_brelse(args->trans, state->path.blk[i].bp);
+		state->path.blk[i].bp = NULL;
+	}
+	xfs_da_state_free(state);
+	return rval;
+}
+
+/*
+ * Trim off a trailing empty freespace block.
+ * Return (in rvalp) 1 if we did it, 0 if not.
+ */
+int						/* error */
+xfs_dir2_node_trim_free(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_fileoff_t		fo,		/* free block number */
+	int			*rvalp)		/* out: did something */
+{
+	xfs_dabuf_t		*bp;		/* freespace buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return code */
+	xfs_dir2_free_t		*free;		/* freespace structure */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_trans_t		*tp;		/* transaction pointer */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+	tp = args->trans;
+	/*
+	 * Read the freespace block.
+	 */
+	if ((error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -1, &bp,
+			XFS_DATA_FORK))) {
+		return error;
+	}
+	free = bp->data;
+	ASSERT(INT_GET(free->hdr.magic, ARCH_CONVERT) == XFS_DIR2_FREE_MAGIC);
+	/*
+	 * If there are used entries, there's nothing to do.
+	 */
+	if (INT_GET(free->hdr.nused, ARCH_CONVERT) > 0) {
+		xfs_da_brelse(tp, bp);
+		*rvalp = 0;
+		return 0;
+	}
+	/*
+	 * Blow the block away.
+	 */
+	if ((error =
+	    xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo),
+		    bp))) {
+		/*
+		 * Can't fail with ENOSPC since that only happens with no
+		 * space reservation, when breaking up an extent into two
+		 * pieces.  This is the last block of an extent.
+		 */
+		ASSERT(error != ENOSPC);
+		xfs_da_brelse(tp, bp);
+		return error;
+	}
+	/*
+	 * Return that we succeeded.
+	 */
+	*rvalp = 1;
+	return 0;
+}
diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h
new file mode 100644
index 000000000000..7afa4ed4f071
--- /dev/null
+++ b/fs/xfs/xfs_dir2_node.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_NODE_H__
+#define __XFS_DIR2_NODE_H__
+
+/*
+ * Directory version 2, btree node format structures
+ */
+
+struct dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_inode;
+struct xfs_trans;
+
+/*
+ * Constants.
+ */
+
+/*
+ * Offset of the freespace index.
+ */
+#define XFS_DIR2_FREE_SPACE	2
+#define XFS_DIR2_FREE_OFFSET	(XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
+#define XFS_DIR2_FREE_FIRSTDB(mp)	\
+	XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET)
+
+#define XFS_DIR2_FREE_MAGIC	0x58443246	/* XD2F */
+
+/*
+ * Structures.
+ */
+typedef struct xfs_dir2_free_hdr {
+	__uint32_t		magic;		/* XFS_DIR2_FREE_MAGIC */
+	__int32_t		firstdb;	/* db of first entry */
+	__int32_t		nvalid;		/* count of valid entries */
+	__int32_t		nused;		/* count of used entries */
+} xfs_dir2_free_hdr_t;
+
+typedef struct xfs_dir2_free {
+	xfs_dir2_free_hdr_t	hdr;		/* block header */
+	xfs_dir2_data_off_t	bests[1];	/* best free counts */
+						/* unused entries are -1 */
+} xfs_dir2_free_t;
+#define XFS_DIR2_MAX_FREE_BESTS(mp)	\
+	(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \
+	 (uint)sizeof(xfs_dir2_data_off_t))
+
+/*
+ * Macros.
+ */
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDB)
+xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db);
+#define XFS_DIR2_DB_TO_FDB(mp,db)	xfs_dir2_db_to_fdb(mp, db)
+#else
+#define XFS_DIR2_DB_TO_FDB(mp,db)	\
+	(XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp))
+#endif
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_DB_TO_FDINDEX)
+int
+xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db);
+#define XFS_DIR2_DB_TO_FDINDEX(mp,db)	xfs_dir2_db_to_fdindex(mp, db)
+#else
+#define XFS_DIR2_DB_TO_FDINDEX(mp,db)	((db) % XFS_DIR2_MAX_FREE_BESTS(mp))
+#endif
+
+/*
+ * Functions.
+ */
+
+extern void
+	xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp,
+				int first, int last);
+
+extern int
+	xfs_dir2_leaf_to_node(struct xfs_da_args *args, struct xfs_dabuf *lbp);
+
+extern xfs_dahash_t
+	xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count);
+
+extern int
+	xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp,
+				  struct xfs_da_args *args, int *indexp,
+				  struct xfs_da_state *state);
+
+extern int
+	xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp,
+			     struct xfs_dabuf *leaf2_bp);
+
+extern int
+	xfs_dir2_leafn_split(struct xfs_da_state *state,
+			     struct xfs_da_state_blk *oldblk,
+			     struct xfs_da_state_blk *newblk);
+
+extern int
+	xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action);
+
+extern void
+	xfs_dir2_leafn_unbalance(struct xfs_da_state *state,
+				 struct xfs_da_state_blk *drop_blk,
+				 struct xfs_da_state_blk *save_blk);
+
+extern int
+	xfs_dir2_node_addname(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_node_lookup(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_node_removename(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_node_replace(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
+				int *rvalp);
+
+#endif	/* __XFS_DIR2_NODE_H__ */
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
new file mode 100644
index 000000000000..c1d2d3d9b2c2
--- /dev/null
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_sf.c
+ * Shortform directory implementation for v2 directories.
+ */
+
+#include <xfs.h>
+
+/*
+ * Prototypes for internal functions.
+ */
+static void xfs_dir2_sf_addname_easy(xfs_da_args_t *args,
+				     xfs_dir2_sf_entry_t *sfep,
+				     xfs_dir2_data_aoff_t offset,
+				     int new_isize);
+static void xfs_dir2_sf_addname_hard(xfs_da_args_t *args, int objchange,
+				     int new_isize);
+static int xfs_dir2_sf_addname_pick(xfs_da_args_t *args, int objchange,
+				    xfs_dir2_sf_entry_t **sfepp,
+				    xfs_dir2_data_aoff_t *offsetp);
+#ifdef DEBUG
+static void xfs_dir2_sf_check(xfs_da_args_t *args);
+#else
+#define xfs_dir2_sf_check(args)
+#endif /* DEBUG */
+#if XFS_BIG_FILESYSTEMS
+static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
+static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
+#endif /* XFS_BIG_FILESYSTEMS */
+
+/*
+ * Given a block directory (dp/block), calculate its size as a shortform (sf)
+ * directory and a header for the sf directory, if it will fit it the
+ * space currently present in the inode.  If it won't fit, the output
+ * size is too big (but not accurate).
+ */
+int						/* size for sf form */
+xfs_dir2_block_sfsize(
+	xfs_inode_t		*dp,		/* incore inode pointer */
+	xfs_dir2_block_t	*block,		/* block directory data */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* output: header for sf form */
+{
+	xfs_dir2_dataptr_t	addr;		/* data entry address */
+	xfs_dir2_leaf_entry_t	*blp;		/* leaf area of the block */
+	xfs_dir2_block_tail_t	*btp;		/* tail area of the block */
+	int			count;		/* shortform entry count */
+	xfs_dir2_data_entry_t	*dep;		/* data entry in the block */
+	int			i;		/* block entry index */
+	int			i8count;	/* count of big-inode entries */
+	int			isdot;		/* entry is "." */
+	int			isdotdot;	/* entry is ".." */
+	xfs_mount_t		*mp;		/* mount structure pointer */
+	int			namelen;	/* total name bytes */
+	xfs_ino_t		parent;		/* parent inode number */
+	int			size=0;		/* total computed size */
+
+	mp = dp->i_mount;
+
+	count = i8count = namelen = 0;
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	blp = XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+
+	/*
+	 * Iterate over the block's data entries by using the leaf pointers.
+	 */
+	for (i = 0; i < INT_GET(btp->count, ARCH_CONVERT); i++) {
+		if ((addr = INT_GET(blp[i].address, ARCH_CONVERT)) == XFS_DIR2_NULL_DATAPTR)
+			continue;
+		/*
+		 * Calculate the pointer to the entry at hand.
+		 */
+		dep = (xfs_dir2_data_entry_t *)
+		      ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr));
+		/*
+		 * Detect . and .., so we can special-case them.
+		 * . is not included in sf directories.
+		 * .. is included by just the parent inode number.
+		 */
+		isdot = dep->namelen == 1 && dep->name[0] == '.';
+		isdotdot =
+			dep->namelen == 2 &&
+			dep->name[0] == '.' && dep->name[1] == '.';
+#if XFS_BIG_FILESYSTEMS
+		if (!isdot)
+			i8count += INT_GET(dep->inumber, ARCH_CONVERT) > XFS_DIR2_MAX_SHORT_INUM;
+#endif
+		if (!isdot && !isdotdot) {
+			count++;
+			namelen += dep->namelen;
+		} else if (isdotdot)
+			parent = INT_GET(dep->inumber, ARCH_CONVERT);
+		/*
+		 * Calculate the new size, see if we should give up yet.
+		 */
+		size = XFS_DIR2_SF_HDR_SIZE(i8count) +		/* header */
+		       count +					/* namelen */
+		       count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */
+		       namelen +				/* name */
+		       (i8count ?				/* inumber */
+				(uint)sizeof(xfs_dir2_ino8_t) * count :
+				(uint)sizeof(xfs_dir2_ino4_t) * count);
+		if (size > XFS_IFORK_DSIZE(dp))
+			return size;		/* size value is a failure */
+	}
+	/*
+	 * Create the output header, if it worked.
+	 */
+	sfhp->count = count;
+	sfhp->i8count = i8count;
+	XFS_DIR2_SF_PUT_INUMBER_ARCH((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent, ARCH_CONVERT);
+	return size;
+}
+
+/*
+ * Convert a block format directory to shortform.
+ * Caller has already checked that it will fit, and built us a header.
+ */
+int						/* error */
+xfs_dir2_block_to_sf(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dabuf_t		*bp,		/* block buffer */
+	int			size,		/* shortform directory size */
+	xfs_dir2_sf_hdr_t	*sfhp)		/* shortform directory hdr */
+{
+	xfs_dir2_block_t	*block;		/* block structure */
+	xfs_dir2_block_tail_t	*btp;		/* block tail pointer */
+	xfs_dir2_data_entry_t	*dep;		/* data entry pointer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_data_unused_t	*dup;		/* unused data pointer */
+	char			*endptr;	/* end of data entries */
+	int			error;		/* error return value */
+	int			logflags;	/* inode logging flags */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	char			*ptr;		/* current data pointer */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	xfs_ino_t		temp;
+
+	xfs_dir2_trace_args_sb("block_to_sf", args, size, bp);
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	/*
+	 * Make a copy of the block data, so we can shrink the inode
+	 * and add local data.
+	 */
+	block = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
+	bcopy(bp->data, block, mp->m_dirblksize);
+	logflags = XFS_ILOG_CORE;
+	if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
+		ASSERT(error != ENOSPC);
+		goto out;
+	}
+	/*
+	 * The buffer is now unconditionally gone, whether
+	 * xfs_dir2_shrink_inode worked or not.
+	 *
+	 * Convert the inode to local format.
+	 */
+	dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+	dp->i_df.if_flags |= XFS_IFINLINE;
+	dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+	ASSERT(dp->i_df.if_bytes == 0);
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+	logflags |= XFS_ILOG_DDATA;
+	/*
+	 * Copy the header into the newly allocate local space.
+	 */
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	bcopy(sfhp, sfp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count));
+	dp->i_d.di_size = size;
+	/*
+	 * Set up to loop over the block's entries.
+	 */
+	btp = XFS_DIR2_BLOCK_TAIL_P(mp, block);
+	ptr = (char *)block->u;
+	endptr = (char *)XFS_DIR2_BLOCK_LEAF_P_ARCH(btp, ARCH_CONVERT);
+	sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	/*
+	 * Loop over the active and unused entries.
+	 * Stop when we reach the leaf/tail portion of the block.
+	 */
+	while (ptr < endptr) {
+		/*
+		 * If it's unused, just skip over it.
+		 */
+		dup = (xfs_dir2_data_unused_t *)ptr;
+		if (INT_GET(dup->freetag, ARCH_CONVERT) == XFS_DIR2_DATA_FREE_TAG) {
+			ptr += INT_GET(dup->length, ARCH_CONVERT);
+			continue;
+		}
+		dep = (xfs_dir2_data_entry_t *)ptr;
+		/*
+		 * Skip .
+		 */
+		if (dep->namelen == 1 && dep->name[0] == '.')
+			ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) == dp->i_ino);
+		/*
+		 * Skip .., but make sure the inode number is right.
+		 */
+		else if (dep->namelen == 2 &&
+			 dep->name[0] == '.' && dep->name[1] == '.')
+			ASSERT(INT_GET(dep->inumber, ARCH_CONVERT) ==
+			       XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT));
+		/*
+		 * Normal entry, copy it into shortform.
+		 */
+		else {
+			sfep->namelen = dep->namelen;
+			XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep,
+				(xfs_dir2_data_aoff_t)
+				((char *)dep - (char *)block), ARCH_CONVERT);
+			bcopy(dep->name, sfep->name, dep->namelen);
+			temp=INT_GET(dep->inumber, ARCH_CONVERT);
+			XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &temp,
+				XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+			sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+		}
+		ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen);
+	}
+	ASSERT((char *)sfep - (char *)sfp == size);
+	xfs_dir2_sf_check(args);
+out:
+	xfs_trans_log_inode(args->trans, dp, logflags);
+	kmem_free(block, mp->m_dirblksize);
+	return error;
+}
+
+/*
+ * Add a name to a shortform directory.
+ * There are two algorithms, "easy" and "hard" which we decide on
+ * before changing anything.
+ * Convert to block form if necessary, if the new entry won't fit.
+ */
+int						/* error */
+xfs_dir2_sf_addname(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	int			add_entsize;	/* size of the new entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			error;		/* error return value */
+	int			incr_isize;	/* total change in size */
+	int			new_isize;	/* di_size after adding name */
+	int			objchange;	/* changing to 8-byte inodes */
+	xfs_dir2_data_aoff_t	offset;		/* offset for new entry */
+	int			old_isize;	/* di_size before adding name */
+	int			pick;		/* which algorithm to use */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+
+	xfs_dir2_trace_args("sf_addname", args);
+	ASSERT(xfs_dir2_sf_lookup(args) == ENOENT);
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Make sure the shortform value has some of its header.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+	/*
+	 * Compute entry (and change in) size.
+	 */
+	add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
+	incr_isize = add_entsize;
+#if XFS_BIG_FILESYSTEMS
+	/*
+	 * Do we have to change to 8 byte inodes?
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
+		/*
+		 * Yes, adjust the entry size and the total size.
+		 */
+		add_entsize +=
+			(uint)sizeof(xfs_dir2_ino8_t) -
+			(uint)sizeof(xfs_dir2_ino4_t);
+		incr_isize +=
+			(sfp->hdr.count + 2) *
+			((uint)sizeof(xfs_dir2_ino8_t) -
+			 (uint)sizeof(xfs_dir2_ino4_t));
+		objchange = 1;
+	} else
+		objchange = 0;
+#else
+	objchange = 0;
+#endif
+	old_isize = (int)dp->i_d.di_size;
+	new_isize = old_isize + incr_isize;
+	/*
+	 * Won't fit as shortform any more (due to size),
+	 * or the pick routine says it won't (due to offset values).
+	 */
+	if (new_isize > XFS_IFORK_DSIZE(dp) ||
+	    (pick =
+	     xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) {
+		/*
+		 * Just checking or no space reservation, it doesn't fit.
+		 */
+		if (args->justcheck || args->total == 0)
+			return XFS_ERROR(ENOSPC);
+		/*
+		 * Convert to block form then add the name.
+		 */
+		error = xfs_dir2_sf_to_block(args);
+		if (error)
+			return error;
+		return xfs_dir2_block_addname(args);
+	}
+	/*
+	 * Just checking, it fits.
+	 */
+	if (args->justcheck)
+		return 0;
+	/*
+	 * Do it the easy way - just add it at the end.
+	 */
+	if (pick == 1)
+		xfs_dir2_sf_addname_easy(args, sfep, offset, new_isize);
+	/*
+	 * Do it the hard way - look for a place to insert the new entry.
+	 * Convert to 8 byte inode numbers first if necessary.
+	 */
+	else {
+		ASSERT(pick == 2);
+#if XFS_BIG_FILESYSTEMS
+		if (objchange)
+			xfs_dir2_sf_toino8(args);
+#endif
+		xfs_dir2_sf_addname_hard(args, objchange, new_isize);
+	}
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Add the new entry the "easy" way.
+ * This is copying the old directory and adding the new entry at the end.
+ * Since it's sorted by "offset" we need room after the last offset
+ * that's already there, and then room to convert to a block directory.
+ * This is already checked by the pick routine.
+ */
+static void
+xfs_dir2_sf_addname_easy(
+	xfs_da_args_t		*args,		/* operation arguments */
+	xfs_dir2_sf_entry_t	*sfep,		/* pointer to new entry */
+	xfs_dir2_data_aoff_t	offset,		/* offset to use for new ent */
+	int			new_isize)	/* new directory size */
+{
+	int			byteoff;	/* byte offset in sf dir */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	byteoff = (int)((char *)sfep - (char *)sfp);
+	/*
+	 * Grow the in-inode space.
+	 */
+	xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen),
+		XFS_DATA_FORK);
+	/*
+	 * Need to set up again due to realloc of the inode data.
+	 */
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
+	/*
+	 * Fill in the new entry.
+	 */
+	sfep->namelen = args->namelen;
+	XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep, offset, ARCH_CONVERT);
+	bcopy(args->name, sfep->name, sfep->namelen);
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &args->inumber,
+		XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+	/*
+	 * Update the header and inode.
+	 */
+	sfp->hdr.count++;
+#if XFS_BIG_FILESYSTEMS
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
+		sfp->hdr.i8count++;
+#endif
+	dp->i_d.di_size = new_isize;
+	xfs_dir2_sf_check(args);
+}
+
+/*
+ * Add the new entry the "hard" way.
+ * The caller has already converted to 8 byte inode numbers if necessary,
+ * in which case we need to leave the i8count at 1.
+ * Find a hole that the new entry will fit into, and copy
+ * the first part of the entries, the new entry, and the last part of
+ * the entries.
+ */
+/* ARGSUSED */
+static void
+xfs_dir2_sf_addname_hard(
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			objchange,	/* changing inode number size */
+	int			new_isize)	/* new directory size */
+{
+	int			add_datasize;	/* data size need for new ent */
+	char			*buf;		/* buffer for old */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			eof;		/* reached end of old dir */
+	int			nbytes;		/* temp for byte copies */
+	xfs_dir2_data_aoff_t	new_offset;	/* next offset value */
+	xfs_dir2_data_aoff_t	offset;		/* current offset value */
+	int			old_isize;	/* previous di_size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* entry in original dir */
+	xfs_dir2_sf_t		*oldsfp;	/* original shortform dir */
+	xfs_dir2_sf_entry_t	*sfep;		/* entry in new dir */
+	xfs_dir2_sf_t		*sfp;		/* new shortform dir */
+
+	/*
+	 * Copy the old directory to the stack buffer.
+	 */
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	old_isize = (int)dp->i_d.di_size;
+	buf = kmem_alloc(old_isize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_t *)buf;
+	bcopy(sfp, oldsfp, old_isize);
+	/*
+	 * Loop over the old directory finding the place we're going
+	 * to insert the new entry.
+	 * If it's going to end up at the end then oldsfep will point there.
+	 */
+	for (offset = XFS_DIR2_DATA_FIRST_OFFSET,
+	      oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp),
+	      add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen),
+	      eof = (char *)oldsfep == &buf[old_isize];
+	     !eof;
+	     offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen),
+	      oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep),
+	      eof = (char *)oldsfep == &buf[old_isize]) {
+		new_offset = XFS_DIR2_SF_GET_OFFSET_ARCH(oldsfep, ARCH_CONVERT);
+		if (offset + add_datasize <= new_offset)
+			break;
+	}
+	/*
+	 * Get rid of the old directory, then allocate space for
+	 * the new one.	 We do this so xfs_idata_realloc won't copy
+	 * the data.
+	 */
+	xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+	/*
+	 * Reset the pointer since the buffer was reallocated.
+	 */
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Copy the first part of the directory, including the header.
+	 */
+	nbytes = (int)((char *)oldsfep - (char *)oldsfp);
+	bcopy(oldsfp, sfp, nbytes);
+	sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + nbytes);
+	/*
+	 * Fill in the new entry, and update the header counts.
+	 */
+	sfep->namelen = args->namelen;
+	XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep, offset, ARCH_CONVERT);
+	bcopy(args->name, sfep->name, sfep->namelen);
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &args->inumber,
+		XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+	sfp->hdr.count++;
+#if XFS_BIG_FILESYSTEMS
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
+		sfp->hdr.i8count++;
+#endif
+	/*
+	 * If there's more left to copy, do that.
+	 */
+	if (!eof) {
+		sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+		bcopy(oldsfep, sfep, old_isize - nbytes);
+	}
+	kmem_free(buf, old_isize);
+	dp->i_d.di_size = new_isize;
+	xfs_dir2_sf_check(args);
+}
+
+/*
+ * Decide if the new entry will fit at all.
+ * If it will fit, pick between adding the new entry to the end (easy)
+ * or somewhere else (hard).
+ * Return 0 (won't fit), 1 (easy), 2 (hard).
+ */
+/*ARGSUSED*/
+static int					/* pick result */
+xfs_dir2_sf_addname_pick(
+	xfs_da_args_t		*args,		/* operation arguments */
+	int			objchange,	/* inode # size changes */
+	xfs_dir2_sf_entry_t	**sfepp,	/* out(1): new entry ptr */
+	xfs_dir2_data_aoff_t	*offsetp)	/* out(1): new offset */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			holefit;	/* found hole it will fit in */
+	int			i;		/* entry number */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_data_aoff_t	offset;		/* data block offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	int			size;		/* entry's data size */
+	int			used;		/* data bytes used */
+
+	dp = args->dp;
+	mp = dp->i_mount;
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	size = XFS_DIR2_DATA_ENTSIZE(args->namelen);
+	offset = XFS_DIR2_DATA_FIRST_OFFSET;
+	sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	holefit = 0;
+	/*
+	 * Loop over sf entries.
+	 * Keep track of data offset and whether we've seen a place
+	 * to insert the new entry.
+	 */
+	for (i = 0; i < sfp->hdr.count; i++) {
+		if (!holefit)
+			holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT);
+		offset = XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT) +
+			 XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
+		sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+	}
+	/*
+	 * Calculate data bytes used excluding the new entry, if this
+	 * was a data block (block form directory).
+	 */
+	used = offset +
+	       (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+	       (uint)sizeof(xfs_dir2_block_tail_t);
+	/*
+	 * If it won't fit in a block form then we can't insert it,
+	 * we'll go back, convert to block, then try the insert and convert
+	 * to leaf.
+	 */
+	if (used + (holefit ? 0 : size) > mp->m_dirblksize)
+		return 0;
+	/*
+	 * If changing the inode number size, do it the hard way.
+	 */
+#if XFS_BIG_FILESYSTEMS
+	if (objchange) {
+		return 2;
+	}
+#else
+	ASSERT(objchange == 0);
+#endif
+	/*
+	 * If it won't fit at the end then do it the hard way (use the hole).
+	 */
+	if (used + size > mp->m_dirblksize)
+		return 2;
+	/*
+	 * Do it the easy way.
+	 */
+	*sfepp = sfep;
+	*offsetp = offset;
+	return 1;
+}
+
+#ifdef DEBUG
+/*
+ * Check consistency of shortform directory, assert if bad.
+ */
+static void
+xfs_dir2_sf_check(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry number */
+	int			i8count;	/* number of big inode#s */
+	xfs_ino_t		ino;		/* entry inode number */
+	int			offset;		/* data offset */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform dir entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	dp = args->dp;
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	offset = XFS_DIR2_DATA_FIRST_OFFSET;
+	ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT);
+	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
+
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+		ASSERT(XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT) >= offset);
+		ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
+		offset =
+			XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT) +
+			XFS_DIR2_DATA_ENTSIZE(sfep->namelen);
+	}
+	ASSERT(i8count == sfp->hdr.i8count);
+#if !XFS_BIG_FILESYSTEMS
+	ASSERT(i8count == 0);
+#endif
+	ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
+	ASSERT(offset +
+	       (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+	       (uint)sizeof(xfs_dir2_block_tail_t) <=
+	       dp->i_mount->m_dirblksize);
+}
+#endif	/* DEBUG */
+
+/*
+ * Create a new (shortform) directory.
+ */
+int					/* error, always 0 */
+xfs_dir2_sf_create(
+	xfs_da_args_t	*args,		/* operation arguments */
+	xfs_ino_t	pino)		/* parent inode number */
+{
+	xfs_inode_t	*dp;		/* incore directory inode */
+	int		i8count;	/* parent inode is an 8-byte number */
+	xfs_dir2_sf_t	*sfp;		/* shortform structure */
+	int		size;		/* directory size */
+
+	xfs_dir2_trace_args_i("sf_create", args, pino);
+	dp = args->dp;
+
+	ASSERT(dp != NULL);
+	ASSERT(dp->i_d.di_size == 0);
+	/*
+	 * If it's currently a zero-length extent file,
+	 * convert it to local format.
+	 */
+	if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+		dp->i_df.if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+		dp->i_df.if_flags |= XFS_IFINLINE;
+	}
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	ASSERT(dp->i_df.if_bytes == 0);
+	i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
+	size = XFS_DIR2_SF_HDR_SIZE(i8count);
+	/*
+	 * Make a buffer for the data.
+	 */
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+	/*
+	 * Fill in the header,
+	 */
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	sfp->hdr.i8count = i8count;
+	/*
+	 * Now can put in the inode number, since i8count is set.
+	 */
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &pino, &sfp->hdr.parent, ARCH_CONVERT);
+	sfp->hdr.count = 0;
+	dp->i_d.di_size = size;
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+int						/* error */
+xfs_dir2_sf_getdents(
+	xfs_inode_t		*dp,		/* incore directory inode */
+	uio_t			*uio,		/* caller's buffer control */
+	int			*eofp,		/* eof reached? (out) */
+	xfs_dirent_t		*dbp,		/* caller's buffer */
+	xfs_dir2_put_t		put)		/* abi's formatting function */
+{
+	int			error;		/* error return value */
+	int			i;		/* shortform entry number */
+	xfs_mount_t		*mp;		/* filesystem mount point */
+	xfs_dir2_dataptr_t	off;		/* current entry's offset */
+	xfs_dir2_put_args_t	p;		/* arg package for put rtn */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+	xfs_off_t			dir_offset;
+
+	mp = dp->i_mount;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Give up if the directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		return XFS_ERROR(EIO);
+	}
+
+	dir_offset = uio->uio_offset;
+
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+
+	/*
+	 * If the block number in the offset is out of range, we're done.
+	 */
+	if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) {
+		*eofp = 1;
+		return 0;
+	}
+
+	/*
+	 * Set up putargs structure.
+	 */
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+	/*
+	 * Put . entry unless we're starting past it.
+	 */
+	if (dir_offset <=
+		    XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+					       XFS_DIR2_DATA_DOT_OFFSET)) {
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0,
+						XFS_DIR2_DATA_DOT_OFFSET);
+#if XFS_BIG_FILESYSTEMS
+		p.ino = dp->i_ino + mp->m_inoadd;
+#else
+		p.ino = dp->i_ino;
+#endif
+		p.name = ".";
+		p.namelen = 1;
+
+		error = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset =
+				XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+						XFS_DIR2_DATA_DOT_OFFSET);
+			return error;
+		}
+	}
+
+	/*
+	 * Put .. entry unless we're starting past it.
+	 */
+	if (dir_offset <=
+		    XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+					       XFS_DIR2_DATA_DOTDOT_OFFSET)) {
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+						XFS_DIR2_DATA_DOTDOT_OFFSET);
+#if XFS_BIG_FILESYSTEMS
+		p.ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT) +
+			mp->m_inoadd;
+#else
+		p.ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT);
+#endif
+		p.name = "..";
+		p.namelen = 2;
+
+		error = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset =
+				XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+					XFS_DIR2_DATA_DOTDOT_OFFSET);
+			return error;
+		}
+	}
+
+	/*
+	 * Loop while there are more entries and put'ing works.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+		     i < sfp->hdr.count;
+			     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+
+		off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+				XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT));
+
+		if (dir_offset > off)
+			continue;
+
+		p.namelen = sfep->namelen;
+
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk,
+				XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, ARCH_CONVERT));
+
+#if XFS_BIG_FILESYSTEMS
+		p.ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+				XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT) +
+			mp->m_inoadd;
+#else
+		p.ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+				XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+#endif
+		p.name = (char *)sfep->name;
+
+		error = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset = off;
+			return error;
+		}
+	}
+
+	/*
+	 * They all fit.
+	 */
+	*eofp = 1;
+
+	uio->uio_offset =
+		XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0);
+
+	return 0;
+}
+
+/*
+ * Lookup an entry in a shortform directory.
+ * Returns EEXIST if found, ENOENT if not found.
+ */
+int						/* error */
+xfs_dir2_sf_lookup(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	xfs_dir2_trace_args("sf_lookup", args);
+	xfs_dir2_sf_check(args);
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bail out if the directory is way too short.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+	/*
+	 * Special case for .
+	 */
+	if (args->namelen == 1 && args->name[0] == '.') {
+		args->inumber = dp->i_ino;
+		return XFS_ERROR(EEXIST);
+	}
+	/*
+	 * Special case for ..
+	 */
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+		args->inumber = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT);
+		return XFS_ERROR(EEXIST);
+	}
+	/*
+	 * Loop over all the entries trying to match ours.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+		if (sfep->namelen == args->namelen &&
+		    sfep->name[0] == args->name[0] &&
+		    bcmp(args->name, sfep->name, args->namelen) == 0) {
+			args->inumber =
+				XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+					XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+			return XFS_ERROR(EEXIST);
+		}
+	}
+	/*
+	 * Didn't find it.
+	 */
+	ASSERT(args->oknoent);
+	return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Remove an entry from a shortform directory.
+ */
+int						/* error */
+xfs_dir2_sf_removename(
+	xfs_da_args_t		*args)
+{
+	int			byteoff;	/* offset of removed entry */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			entsize;	/* this entry's size */
+	int			i;		/* shortform entry index */
+	int			newsize;	/* new inode size */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	xfs_dir2_trace_args("sf_removename", args);
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	oldsize = (int)dp->i_d.di_size;
+	/*
+	 * Bail out if the directory is way too short.
+	 */
+	if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == oldsize);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+	/*
+	 * Loop over the old directory entries.
+	 * Find the one we're deleting.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+		if (sfep->namelen == args->namelen &&
+		    sfep->name[0] == args->name[0] &&
+		    bcmp(sfep->name, args->name, args->namelen) == 0) {
+			ASSERT(XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+					XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT) ==
+				args->inumber);
+			break;
+		}
+	}
+	/*
+	 * Didn't find it.
+	 */
+	if (i == sfp->hdr.count) {
+		return XFS_ERROR(ENOENT);
+	}
+	/*
+	 * Calculate sizes.
+	 */
+	byteoff = (int)((char *)sfep - (char *)sfp);
+	entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen);
+	newsize = oldsize - entsize;
+	/*
+	 * Copy the part if any after the removed entry, sliding it down.
+	 */
+	if (byteoff + entsize < oldsize)
+		ovbcopy((char *)sfp + byteoff + entsize, (char *)sfp + byteoff,
+			oldsize - (byteoff + entsize));
+	/*
+	 * Fix up the header and file size.
+	 */
+	sfp->hdr.count--;
+	dp->i_d.di_size = newsize;
+	/*
+	 * Reallocate, making it smaller.
+	 */
+	xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+#if XFS_BIG_FILESYSTEMS
+	/*
+	 * Are we changing inode number size?
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+		if (sfp->hdr.i8count == 1)
+			xfs_dir2_sf_toino4(args);
+		else
+			sfp->hdr.i8count--;
+	}
+#endif
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return 0;
+}
+
+/*
+ * Replace the inode number of an entry in a shortform directory.
+ */
+int						/* error */
+xfs_dir2_sf_replace(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+#if XFS_BIG_FILESYSTEMS || defined(DEBUG)
+	xfs_ino_t		ino=0;		/* entry old inode number */
+#endif
+#if XFS_BIG_FILESYSTEMS
+	int			i8elevated;	/* sf_toino8 set i8count=1 */
+#endif
+	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
+	xfs_dir2_sf_t		*sfp;		/* shortform structure */
+
+	xfs_dir2_trace_args("sf_replace", args);
+	dp = args->dp;
+
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Bail out if the shortform directory is way too small.
+	 */
+	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count));
+#if XFS_BIG_FILESYSTEMS
+	/*
+	 * New inode number is large, and need to convert to 8-byte inodes.
+	 */
+	if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) {
+		int	error;			/* error return value */
+		int	newsize;		/* new inode size */
+
+		newsize =
+			dp->i_df.if_bytes +
+			(sfp->hdr.count + 1) *
+			((uint)sizeof(xfs_dir2_ino8_t) -
+			 (uint)sizeof(xfs_dir2_ino4_t));
+		/*
+		 * Won't fit as shortform, convert to block then do replace.
+		 */
+		if (newsize > XFS_IFORK_DSIZE(dp)) {
+			error = xfs_dir2_sf_to_block(args);
+			if (error) {
+				return error;
+			}
+			return xfs_dir2_block_replace(args);
+		}
+		/*
+		 * Still fits, convert to 8-byte now.
+		 */
+		xfs_dir2_sf_toino8(args);
+		i8elevated = 1;
+		sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	} else
+		i8elevated = 0;
+#endif
+	ASSERT(args->namelen != 1 || args->name[0] != '.');
+	/*
+	 * Replace ..'s entry.
+	 */
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+#if XFS_BIG_FILESYSTEMS || defined(DEBUG)
+		ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, &sfp->hdr.parent, ARCH_CONVERT);
+		ASSERT(args->inumber != ino);
+#endif
+		XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &args->inumber, &sfp->hdr.parent, ARCH_CONVERT);
+	}
+	/*
+	 * Normal entry, look for the name.
+	 */
+	else {
+		for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp);
+		     i < sfp->hdr.count;
+		     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) {
+			if (sfep->namelen == args->namelen &&
+			    sfep->name[0] == args->name[0] &&
+			    bcmp(args->name, sfep->name, args->namelen) == 0) {
+#if XFS_BIG_FILESYSTEMS || defined(DEBUG)
+				ino = XFS_DIR2_SF_GET_INUMBER_ARCH(sfp,
+					XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+				ASSERT(args->inumber != ino);
+#endif
+				XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &args->inumber,
+					XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+				break;
+			}
+		}
+		/*
+		 * Didn't find it.
+		 */
+		if (i == sfp->hdr.count) {
+			ASSERT(args->oknoent);
+#if XFS_BIG_FILESYSTEMS
+			if (i8elevated)
+				xfs_dir2_sf_toino4(args);
+#endif
+			return XFS_ERROR(ENOENT);
+		}
+	}
+#if XFS_BIG_FILESYSTEMS
+	/*
+	 * See if the old number was large, the new number is small.
+	 */
+	if (ino > XFS_DIR2_MAX_SHORT_INUM &&
+	    args->inumber <= XFS_DIR2_MAX_SHORT_INUM) {
+		/*
+		 * And the old count was one, so need to convert to small.
+		 */
+		if (sfp->hdr.i8count == 1)
+			xfs_dir2_sf_toino4(args);
+		else
+			sfp->hdr.i8count--;
+	}
+	/*
+	 * See if the old number was small, the new number is large.
+	 */
+	if (ino <= XFS_DIR2_MAX_SHORT_INUM &&
+	    args->inumber > XFS_DIR2_MAX_SHORT_INUM) {
+		/*
+		 * add to the i8count unless we just converted to 8-byte
+		 * inodes (which does an implied i8count = 1)
+		 */
+		ASSERT(sfp->hdr.i8count != 0);
+		if (!i8elevated)
+			sfp->hdr.i8count++;
+	}
+#endif
+	xfs_dir2_sf_check(args);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+	return 0;
+}
+
+#if XFS_BIG_FILESYSTEMS
+/*
+ * Convert from 8-byte inode numbers to 4-byte inode numbers.
+ * The last 8-byte inode number is gone, but the count is still 1.
+ */
+static void
+xfs_dir2_sf_toino4(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	char			*buf;		/* old dir's buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	xfs_ino_t		ino;		/* entry inode number */
+	int			newsize;	/* new inode size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
+	xfs_dir2_sf_t		*oldsfp;	/* old sf directory */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
+	xfs_dir2_sf_t		*sfp;		/* new sf directory */
+
+	xfs_dir2_trace_args("sf_toino4", args);
+	dp = args->dp;
+
+	/*
+	 * Copy the old directory to the buffer.
+	 * Then nuke it from the inode, and add the new buffer to the inode.
+	 * Don't want xfs_idata_realloc copying the data here.
+	 */
+	oldsize = dp->i_df.if_bytes;
+	buf = kmem_alloc(oldsize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsfp->hdr.i8count == 1);
+	bcopy(oldsfp, buf, oldsize);
+	/*
+	 * Compute the new inode size.
+	 */
+	newsize =
+		oldsize -
+		(oldsfp->hdr.count + 1) *
+		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+	/*
+	 * Reset our pointers, the data has moved.
+	 */
+	oldsfp = (xfs_dir2_sf_t *)buf;
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Fill in the new header.
+	 */
+	sfp->hdr.count = oldsfp->hdr.count;
+	sfp->hdr.i8count = 0;
+	ino = XFS_DIR2_SF_GET_INUMBER_ARCH(oldsfp, &oldsfp->hdr.parent, ARCH_CONVERT);
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &ino, &sfp->hdr.parent, ARCH_CONVERT);
+	/*
+	 * Copy the entries field by field.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
+		    oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
+		  oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
+		sfep->namelen = oldsfep->namelen;
+		sfep->offset = oldsfep->offset;
+		bcopy(oldsfep->name, sfep->name, sfep->namelen);
+		ino = XFS_DIR2_SF_GET_INUMBER_ARCH(oldsfp,
+			XFS_DIR2_SF_INUMBERP(oldsfep), ARCH_CONVERT);
+		XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+	}
+	/*
+	 * Clean up the inode.
+	 */
+	kmem_free(buf, oldsize);
+	dp->i_d.di_size = newsize;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+
+/*
+ * Convert from 4-byte inode numbers to 8-byte inode numbers.
+ * The new 8-byte inode number is not there yet, we leave with the
+ * count 1 but no corresponding entry.
+ */
+static void
+xfs_dir2_sf_toino8(
+	xfs_da_args_t		*args)		/* operation arguments */
+{
+	char			*buf;		/* old dir's buffer */
+	xfs_inode_t		*dp;		/* incore directory inode */
+	int			i;		/* entry index */
+	xfs_ino_t		ino;		/* entry inode number */
+	int			newsize;	/* new inode size */
+	xfs_dir2_sf_entry_t	*oldsfep;	/* old sf entry */
+	xfs_dir2_sf_t		*oldsfp;	/* old sf directory */
+	int			oldsize;	/* old inode size */
+	xfs_dir2_sf_entry_t	*sfep;		/* new sf entry */
+	xfs_dir2_sf_t		*sfp;		/* new sf directory */
+
+	xfs_dir2_trace_args("sf_toino8", args);
+	dp = args->dp;
+
+	/*
+	 * Copy the old directory to the buffer.
+	 * Then nuke it from the inode, and add the new buffer to the inode.
+	 * Don't want xfs_idata_realloc copying the data here.
+	 */
+	oldsize = dp->i_df.if_bytes;
+	buf = kmem_alloc(oldsize, KM_SLEEP);
+	oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	ASSERT(oldsfp->hdr.i8count == 0);
+	bcopy(oldsfp, buf, oldsize);
+	/*
+	 * Compute the new inode size.
+	 */
+	newsize =
+		oldsize +
+		(oldsfp->hdr.count + 1) *
+		((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
+	xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
+	xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
+	/*
+	 * Reset our pointers, the data has moved.
+	 */
+	oldsfp = (xfs_dir2_sf_t *)buf;
+	sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data;
+	/*
+	 * Fill in the new header.
+	 */
+	sfp->hdr.count = oldsfp->hdr.count;
+	sfp->hdr.i8count = 1;
+	ino = XFS_DIR2_SF_GET_INUMBER_ARCH(oldsfp, &oldsfp->hdr.parent, ARCH_CONVERT);
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &ino, &sfp->hdr.parent, ARCH_CONVERT);
+	/*
+	 * Copy the entries field by field.
+	 */
+	for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp),
+		    oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp);
+	     i < sfp->hdr.count;
+	     i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep),
+		  oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) {
+		sfep->namelen = oldsfep->namelen;
+		sfep->offset = oldsfep->offset;
+		bcopy(oldsfep->name, sfep->name, sfep->namelen);
+		ino = XFS_DIR2_SF_GET_INUMBER_ARCH(oldsfp,
+			XFS_DIR2_SF_INUMBERP(oldsfep), ARCH_CONVERT);
+		XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep), ARCH_CONVERT);
+	}
+	/*
+	 * Clean up the inode.
+	 */
+	kmem_free(buf, oldsize);
+	dp->i_d.di_size = newsize;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+}
+#endif	/* XFS_BIG_FILESYSTEMS */
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
new file mode 100644
index 000000000000..576c19f17c68
--- /dev/null
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_SF_H__
+#define __XFS_DIR2_SF_H__
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to
+ * fit into the literal area of the inode.
+ */
+
+struct dirent;
+struct uio;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_dir2_block;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Maximum size of a shortform directory.
+ */
+#define XFS_DIR2_SF_MAX_SIZE	\
+	(XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
+	 (uint)sizeof(xfs_agino_t))
+
+/*
+ * Inode number stored as 8 8-bit values.
+ */
+typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
+
+#define XFS_DIR2_SF_GET_INO8_ARCH(di,arch)	\
+	(xfs_ino_t)(DIRINO_GET_ARCH(&di,arch))
+#define XFS_DIR2_SF_GET_INO8(di)		\
+	XFS_DIR2_SF_GET_INO8_ARCH(di,ARCH_NOCONVERT)
+
+/*
+ * Inode number stored as 4 8-bit values.
+ * Works a lot of the time, when all the inode numbers in a directory
+ * fit in 32 bits.
+ */
+typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
+#define XFS_DIR2_SF_GET_INO4_ARCH(di,arch)	\
+	(xfs_ino_t)(DIRINO4_GET_ARCH(&di,arch))
+#define XFS_DIR2_SF_GET_INO4(di)		\
+	XFS_DIR2_SF_GET_INO4_ARCH(di,ARCH_NOCONVERT)
+
+typedef union {
+	xfs_dir2_ino8_t i8;
+	xfs_dir2_ino4_t i4;
+} xfs_dir2_inou_t;
+#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
+
+/*
+ * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
+ * Only need 16 bits, this is the byte offset into the single block form.
+ */
+typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t;
+
+/*
+ * The parent directory has a dedicated field, and the self-pointer must
+ * be calculated on the fly.
+ *
+ * Entries are packed toward the top as tightly as possible.  The header
+ * and the elements must be bcopy()'d out into a work area to get correct
+ * alignment for the inode number fields.
+ */
+typedef struct xfs_dir2_sf_hdr {
+	__uint8_t		count;		/* count of entries */
+	__uint8_t		i8count;	/* count of 8-byte inode #s */
+	xfs_dir2_inou_t		parent;		/* parent dir inode number */
+} xfs_dir2_sf_hdr_t;
+
+typedef struct xfs_dir2_sf_entry {
+	__uint8_t		namelen;	/* actual name length */
+	xfs_dir2_sf_off_t	offset;		/* saved offset */
+	__uint8_t		name[1];	/* name, variable size */
+	xfs_dir2_inou_t		inumber;	/* inode number, var. offset */
+} xfs_dir2_sf_entry_t;
+
+typedef struct xfs_dir2_sf {
+	xfs_dir2_sf_hdr_t	hdr;		/* shortform header */
+	xfs_dir2_sf_entry_t	list[1];	/* shortform entries */
+} xfs_dir2_sf_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_HDR_SIZE)
+int xfs_dir2_sf_hdr_size(int i8count);
+#define XFS_DIR2_SF_HDR_SIZE(i8count)	xfs_dir2_sf_hdr_size(i8count)
+#else
+#define XFS_DIR2_SF_HDR_SIZE(i8count)	\
+	((uint)sizeof(xfs_dir2_sf_hdr_t) - \
+	 ((i8count) == 0) * \
+	 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_INUMBERP)
+xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep);
+#define XFS_DIR2_SF_INUMBERP(sfep)	xfs_dir2_sf_inumberp(sfep)
+#else
+#define XFS_DIR2_SF_INUMBERP(sfep)	\
+	((xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen])
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_INUMBER)
+xfs_intino_t xfs_dir2_sf_get_inumber_arch(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from,
+					    xfs_arch_t arch);
+#define XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, from, arch)	\
+	xfs_dir2_sf_get_inumber_arch(sfp, from, arch)
+
+#else
+#define XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, from, arch)	\
+	((sfp)->hdr.i8count == 0 ? \
+		(xfs_intino_t)XFS_DIR2_SF_GET_INO4_ARCH(*(from), arch) : \
+		(xfs_intino_t)XFS_DIR2_SF_GET_INO8_ARCH(*(from), arch))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_INUMBER)
+void xfs_dir2_sf_put_inumber_arch(xfs_dir2_sf_t *sfp, xfs_ino_t *from,
+				     xfs_dir2_inou_t *to, xfs_arch_t arch);
+#define XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp,from,to,arch)	\
+	xfs_dir2_sf_put_inumber_arch(sfp,from,to,arch)
+#else
+#define XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp,from,to,arch)	\
+	if ((sfp)->hdr.i8count == 0) { \
+	    DIRINO4_COPY_ARCH(from,to,arch); \
+	} else { \
+	    DIRINO_COPY_ARCH(from,to,arch); \
+	}
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_GET_OFFSET)
+xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset_arch(xfs_dir2_sf_entry_t *sfep,
+						    xfs_arch_t arch);
+xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep);
+#define XFS_DIR2_SF_GET_OFFSET_ARCH(sfep,arch)	\
+	xfs_dir2_sf_get_offset_arch(sfep,arch)
+#else
+#define XFS_DIR2_SF_GET_OFFSET_ARCH(sfep,arch)	\
+	INT_GET_UNALIGNED_16_ARCH(&(sfep)->offset.i,arch)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_PUT_OFFSET)
+void xfs_dir2_sf_put_offset_arch(xfs_dir2_sf_entry_t *sfep,
+				    xfs_dir2_data_aoff_t off, xfs_arch_t arch);
+#define XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep,off,arch) \
+	xfs_dir2_sf_put_offset_arch(sfep,off,arch)
+#else
+#define XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep,off,arch)	\
+	INT_SET_UNALIGNED_16_ARCH(&(sfep)->offset.i,off,arch)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYNAME)
+int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len);
+#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len)	\
+	xfs_dir2_sf_entsize_byname(sfp,len)
+#else
+#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len)	/* space a name uses */ \
+	((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \
+	 ((sfp)->hdr.i8count == 0) * \
+	 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_ENTSIZE_BYENTRY)
+int xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep);
+#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep)	\
+	xfs_dir2_sf_entsize_byentry(sfp,sfep)
+#else
+#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep)	/* space an entry uses */ \
+	((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \
+	 ((sfp)->hdr.i8count == 0) * \
+	 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_FIRSTENTRY)
+xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp);
+#define XFS_DIR2_SF_FIRSTENTRY(sfp)	xfs_dir2_sf_firstentry(sfp)
+#else
+#define XFS_DIR2_SF_FIRSTENTRY(sfp)	/* first entry in struct */ \
+	((xfs_dir2_sf_entry_t *) \
+	 ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR2_SF_NEXTENTRY)
+xfs_dir2_sf_entry_t *xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp,
+					   xfs_dir2_sf_entry_t *sfep);
+#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep)		xfs_dir2_sf_nextentry(sfp,sfep)
+#else
+#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep)		/* next entry in struct */ \
+	((xfs_dir2_sf_entry_t *) \
+		((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep)))
+#endif
+
+/*
+ * Functions.
+ */
+
+extern int
+	xfs_dir2_block_sfsize(struct xfs_inode *dp,
+			      struct xfs_dir2_block *block,
+			      xfs_dir2_sf_hdr_t *sfhp);
+
+extern int
+	xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp,
+			     int size, xfs_dir2_sf_hdr_t *sfhp);
+
+extern int
+	xfs_dir2_sf_addname(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
+
+extern int
+	xfs_dir2_sf_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
+			     struct xfs_dirent *dbp, xfs_dir2_put_t put);
+
+extern int
+	xfs_dir2_sf_lookup(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_sf_removename(struct xfs_da_args *args);
+
+extern int
+	xfs_dir2_sf_replace(struct xfs_da_args *args);
+
+#endif	/* __XFS_DIR2_SF_H__ */
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
new file mode 100644
index 000000000000..fe9280e1f427
--- /dev/null
+++ b/fs/xfs/xfs_dir2_trace.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir2_trace.c
+ * Tracing for xfs v2 directories.
+ */
+#include <xfs.h>
+
+
+#ifdef DEBUG
+ktrace_t	*xfs_dir2_trace_buf;
+#endif /* DEBUG */
+
+#ifdef XFS_DIR2_TRACE
+/*
+ * Enter something in the trace buffers.
+ */
+static void
+xfs_dir2_trace_enter(
+	xfs_inode_t	*dp,
+	int		type,
+	char		*where,
+	char		*name,
+	int		namelen,
+	__psunsigned_t	a0,
+	__psunsigned_t	a1,
+	__psunsigned_t	a2,
+	__psunsigned_t	a3,
+	__psunsigned_t	a4,
+	__psunsigned_t	a5,
+	__psunsigned_t	a6)
+{
+	__psunsigned_t	n[6];
+
+	ASSERT(xfs_dir2_trace_buf);
+	ASSERT(dp->i_dir_trace);
+	if (name)
+		bcopy(name, n, min(sizeof(n), namelen));
+	else
+		bzero((char *)n, sizeof(n));
+	ktrace_enter(xfs_dir2_trace_buf,
+		(void *)(__psunsigned_t)type, (void *)where,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6,
+		(void *)(__psunsigned_t)namelen,
+		(void *)n[0], (void *)n[1], (void *)n[2],
+		(void *)n[3], (void *)n[4], (void *)n[5]);
+	ktrace_enter(dp->i_dir_trace,
+		(void *)(__psunsigned_t)type, (void *)where,
+		(void *)a0, (void *)a1, (void *)a2, (void *)a3,
+		(void *)a4, (void *)a5, (void *)a6,
+		(void *)(__psunsigned_t)namelen,
+		(void *)n[0], (void *)n[1], (void *)n[2],
+		(void *)n[3], (void *)n[4], (void *)n[5]);
+}
+
+void
+xfs_dir2_trace_args(
+	char		*where,
+	xfs_da_args_t	*args)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, 0, 0);
+}
+
+void
+xfs_dir2_trace_args_b(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_dabuf_t	*bp)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck,
+		(__psunsigned_t)(bp ? bp->bps[0] : NULL), 0);
+}
+
+void
+xfs_dir2_trace_args_bb(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_dabuf_t	*lbp,
+	xfs_dabuf_t	*dbp)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck,
+		(__psunsigned_t)(lbp ? lbp->bps[0] : NULL),
+		(__psunsigned_t)(dbp ? dbp->bps[0] : NULL));
+}
+
+void
+xfs_dir2_trace_args_bibii(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_dabuf_t	*bs,
+	int		ss,
+	xfs_dabuf_t	*bd,
+	int		sd,
+	int		c)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)(bs ? bs->bps[0] : NULL), (__psunsigned_t)ss,
+		(__psunsigned_t)(bd ? bd->bps[0] : NULL), (__psunsigned_t)sd,
+		(__psunsigned_t)c);
+}
+
+void
+xfs_dir2_trace_args_db(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_dir2_db_t	db,
+	xfs_dabuf_t	*bp)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, (__psunsigned_t)db,
+		(__psunsigned_t)(bp ? bp->bps[0] : NULL));
+}
+
+void
+xfs_dir2_trace_args_i(
+	char		*where,
+	xfs_da_args_t	*args,
+	xfs_ino_t	i)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, (__psunsigned_t)i, 0);
+}
+
+void
+xfs_dir2_trace_args_s(
+	char		*where,
+	xfs_da_args_t	*args,
+	int		s)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, (__psunsigned_t)s, 0);
+}
+
+void
+xfs_dir2_trace_args_sb(
+	char		*where,
+	xfs_da_args_t	*args,
+	int		s,
+	xfs_dabuf_t	*bp)
+{
+	xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where,
+		(char *)args->name, (int)args->namelen,
+		(__psunsigned_t)args->hashval, (__psunsigned_t)args->inumber,
+		(__psunsigned_t)args->dp, (__psunsigned_t)args->trans,
+		(__psunsigned_t)args->justcheck, (__psunsigned_t)s,
+		(__psunsigned_t)(bp ? bp->bps[0] : NULL));
+}
+#endif	/* XFS_DIR2_TRACE */
diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h
new file mode 100644
index 000000000000..96118532bbbf
--- /dev/null
+++ b/fs/xfs/xfs_dir2_trace.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR2_TRACE_H__
+#define __XFS_DIR2_TRACE_H__
+
+/*
+ * Tracing for xfs v2 directories.
+ */
+
+struct ktrace;
+struct xfs_dabuf;
+struct xfs_da_args;
+
+#ifdef XFS_ALL_TRACE
+#define XFS_DIR2_TRACE
+#endif	/* XFS_ALL_TRACE */
+
+#if !defined(DEBUG)
+#undef XFS_DIR2_TRACE
+#endif	/* !DEBUG */
+
+#define XFS_DIR2_GTRACE_SIZE		4096	/* global buffer */
+#define XFS_DIR2_KTRACE_SIZE		32	/* per-inode buffer */
+
+#define XFS_DIR2_KTRACE_ARGS		1	/* args only */
+#define XFS_DIR2_KTRACE_ARGS_B		2	/* args + buffer */
+#define XFS_DIR2_KTRACE_ARGS_BB		3	/* args + 2 buffers */
+#define XFS_DIR2_KTRACE_ARGS_DB		4	/* args, db, buffer */
+#define XFS_DIR2_KTRACE_ARGS_I		5	/* args, inum */
+#define XFS_DIR2_KTRACE_ARGS_S		6	/* args, int */
+#define XFS_DIR2_KTRACE_ARGS_SB		7	/* args, int, buffer */
+#define XFS_DIR2_KTRACE_ARGS_BIBII	8	/* args, buf/int/buf/int/int */
+
+#ifdef XFS_DIR2_TRACE
+
+void xfs_dir2_trace_args(char *where, struct xfs_da_args *args);
+void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args,
+			   struct xfs_dabuf *bp);
+void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args,
+			    struct xfs_dabuf *lbp, struct xfs_dabuf *dbp);
+void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args,
+			       struct xfs_dabuf *bs, int ss,
+			       struct xfs_dabuf *bd, int sd, int c);
+void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args,
+			    xfs_dir2_db_t db, struct xfs_dabuf *bp);
+void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i);
+void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s);
+void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s,
+			    struct xfs_dabuf *bp);
+
+#else	/* XFS_DIR2_TRACE */
+
+#define xfs_dir2_trace_args(where, args)
+#define xfs_dir2_trace_args_b(where, args, bp)
+#define xfs_dir2_trace_args_bb(where, args, lbp, dbp)
+#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c)
+#define xfs_dir2_trace_args_db(where, args, db, bp)
+#define xfs_dir2_trace_args_i(where, args, i)
+#define xfs_dir2_trace_args_s(where, args, s)
+#define xfs_dir2_trace_args_sb(where, args, s, bp)
+
+#endif	/* XFS_DIR2_TRACE */
+
+extern struct ktrace *xfs_dir2_trace_buf;
+
+#endif	/* __XFS_DIR2_TRACE_H__ */
diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c
new file mode 100644
index 000000000000..1aceaf37693d
--- /dev/null
+++ b/fs/xfs/xfs_dir_leaf.c
@@ -0,0 +1,2224 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * xfs_dir_leaf.c
+ *
+ * GROT: figure out how to recover gracefully when bmap returns ENOSPC.
+ */
+
+#include <xfs.h>
+
+
+/*
+ * xfs_dir_leaf.c
+ *
+ * Routines to implement leaf blocks of directories as Btrees of hashed names.
+ */
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Routines used for growing the Btree.
+ */
+STATIC void xfs_dir_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args,
+					      int insertion_index,
+					      int freemap_index);
+STATIC int xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer,
+					    int musthave, int justcheck);
+STATIC void xfs_dir_leaf_rebalance(xfs_da_state_t *state,
+						  xfs_da_state_blk_t *blk1,
+						  xfs_da_state_blk_t *blk2);
+STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
+					  xfs_da_state_blk_t *leaf_blk_1,
+					  xfs_da_state_blk_t *leaf_blk_2,
+					  int *number_entries_in_blk1,
+					  int *number_namebytes_in_blk1);
+
+/*
+ * Utility routines.
+ */
+STATIC void xfs_dir_leaf_moveents(xfs_dir_leafblock_t *src_leaf,
+					      int src_start,
+					      xfs_dir_leafblock_t *dst_leaf,
+					      int dst_start, int move_count,
+					      xfs_mount_t *mp);
+
+
+/*========================================================================
+ * External routines when dirsize < XFS_IFORK_DSIZE(dp).
+ *========================================================================*/
+
+
+/*
+ * Validate a given inode number.
+ */
+int
+xfs_dir_ino_validate(xfs_mount_t *mp, xfs_ino_t ino)
+{
+	xfs_agblock_t	agblkno;
+	xfs_agino_t	agino;
+	xfs_agnumber_t	agno;
+	int		ino_ok;
+	int		ioff;
+
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agblkno = XFS_INO_TO_AGBNO(mp, ino);
+	ioff = XFS_INO_TO_OFFSET(mp, ino);
+	agino = XFS_OFFBNO_TO_AGINO(mp, agblkno, ioff);
+	ino_ok =
+		agno < mp->m_sb.sb_agcount &&
+		agblkno < mp->m_sb.sb_agblocks &&
+		agblkno != 0 &&
+		ioff < (1 << mp->m_sb.sb_inopblog) &&
+		XFS_AGINO_TO_INO(mp, agno, agino) == ino;
+	if (XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
+			XFS_RANDOM_DIR_INO_VALIDATE)) {
+		xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx\n",
+				(unsigned long long) ino);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	return 0;
+}
+
+/*
+ * Create the initial contents of a shortform directory.
+ */
+int
+xfs_dir_shortform_create(xfs_da_args_t *args, xfs_ino_t parent)
+{
+	xfs_dir_sf_hdr_t *hdr;
+	xfs_inode_t *dp;
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	ASSERT(dp->i_d.di_size == 0);
+	if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) {
+		dp->i_df.if_flags &= ~XFS_IFEXTENTS;	/* just in case */
+		dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE);
+		dp->i_df.if_flags |= XFS_IFINLINE;
+	}
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	ASSERT(dp->i_df.if_bytes == 0);
+	xfs_idata_realloc(dp, sizeof(*hdr), XFS_DATA_FORK);
+	hdr = (xfs_dir_sf_hdr_t *)dp->i_df.if_u1.if_data;
+	XFS_DIR_SF_PUT_DIRINO_ARCH(&parent, &hdr->parent, ARCH_CONVERT);
+
+	INT_ZERO(hdr->count, ARCH_CONVERT);
+	dp->i_d.di_size = sizeof(*hdr);
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+	return(0);
+}
+
+/*
+ * Add a name to the shortform directory structure.
+ * Overflow from the inode has already been checked for.
+ */
+int
+xfs_dir_shortform_addname(xfs_da_args_t *args)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	int i, offset, size;
+	xfs_inode_t *dp;
+
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
+		if (sfe->namelen == args->namelen &&
+		    args->name[0] == sfe->name[0] &&
+		    bcmp(args->name, sfe->name, args->namelen) == 0)
+			return(XFS_ERROR(EEXIST));
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+
+	offset = (int)((char *)sfe - (char *)sf);
+	size = XFS_DIR_SF_ENTSIZE_BYNAME(args->namelen);
+	xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	sfe = (xfs_dir_sf_entry_t *)((char *)sf + offset);
+
+	XFS_DIR_SF_PUT_DIRINO_ARCH(&args->inumber, &sfe->inumber, ARCH_CONVERT);
+	sfe->namelen = args->namelen;
+	bcopy(args->name, sfe->name, sfe->namelen);
+	INT_MOD(sf->hdr.count, ARCH_CONVERT, +1);
+
+	dp->i_d.di_size += size;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+
+	return(0);
+}
+
+/*
+ * Remove a name from the shortform directory structure.
+ */
+int
+xfs_dir_shortform_removename(xfs_da_args_t *args)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	int base, size = 0, i;
+	xfs_inode_t *dp;
+
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	base = sizeof(xfs_dir_sf_hdr_t);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	sfe = &sf->list[0];
+	for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
+		size = XFS_DIR_SF_ENTSIZE_BYENTRY(sfe);
+		if (sfe->namelen == args->namelen &&
+		    sfe->name[0] == args->name[0] &&
+		    bcmp(sfe->name, args->name, args->namelen) == 0)
+			break;
+		base += size;
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	if (i < 0) {
+		ASSERT(args->oknoent);
+		return(XFS_ERROR(ENOENT));
+	}
+
+	if ((base + size) != dp->i_d.di_size) {
+		ovbcopy(&((char *)sf)[base+size], &((char *)sf)[base],
+					      dp->i_d.di_size - (base+size));
+	}
+	INT_MOD(sf->hdr.count, ARCH_CONVERT, -1);
+
+	xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
+	dp->i_d.di_size -= size;
+	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
+
+	return(0);
+}
+
+/*
+ * Look up a name in a shortform directory structure.
+ */
+int
+xfs_dir_shortform_lookup(xfs_da_args_t *args)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	int i;
+	xfs_inode_t *dp;
+
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+		XFS_DIR_SF_GET_DIRINO_ARCH(&sf->hdr.parent, &args->inumber, ARCH_CONVERT);
+		return(XFS_ERROR(EEXIST));
+	}
+	if (args->namelen == 1 && args->name[0] == '.') {
+		args->inumber = dp->i_ino;
+		return(XFS_ERROR(EEXIST));
+	}
+	sfe = &sf->list[0];
+	for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
+		if (sfe->namelen == args->namelen &&
+		    sfe->name[0] == args->name[0] &&
+		    bcmp(args->name, sfe->name, args->namelen) == 0) {
+			XFS_DIR_SF_GET_DIRINO_ARCH(&sfe->inumber, &args->inumber, ARCH_CONVERT);
+			return(XFS_ERROR(EEXIST));
+		}
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	ASSERT(args->oknoent);
+	return(XFS_ERROR(ENOENT));
+}
+
+/*
+ * Convert from using the shortform to the leaf.
+ */
+int
+xfs_dir_shortform_to_leaf(xfs_da_args_t *iargs)
+{
+	xfs_inode_t *dp;
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	xfs_da_args_t args;
+	xfs_ino_t inumber;
+	char *tmpbuffer;
+	int retval, i, size;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+
+	dp = iargs->dp;
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	size = dp->i_df.if_bytes;
+	tmpbuffer = kmem_alloc(size, KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+
+	bcopy(dp->i_df.if_u1.if_data, tmpbuffer, size);
+
+	sf = (xfs_dir_shortform_t *)tmpbuffer;
+	XFS_DIR_SF_GET_DIRINO_ARCH(&sf->hdr.parent, &inumber, ARCH_CONVERT);
+
+	xfs_idata_realloc(dp, -size, XFS_DATA_FORK);
+	dp->i_d.di_size = 0;
+	xfs_trans_log_inode(iargs->trans, dp, XFS_ILOG_CORE);
+	retval = xfs_da_grow_inode(iargs, &blkno);
+	if (retval)
+		goto out;
+
+	ASSERT(blkno == 0);
+	retval = xfs_dir_leaf_create(iargs, blkno, &bp);
+	if (retval)
+		goto out;
+	xfs_da_buf_done(bp);
+
+	args.name = ".";
+	args.namelen = 1;
+	args.hashval = xfs_dir_hash_dot;
+	args.inumber = dp->i_ino;
+	args.dp = dp;
+	args.firstblock = iargs->firstblock;
+	args.flist = iargs->flist;
+	args.total = iargs->total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = iargs->trans;
+	args.justcheck = 0;
+	args.addname = args.oknoent = 1;
+	retval = xfs_dir_leaf_addname(&args);
+	if (retval)
+		goto out;
+
+	args.name = "..";
+	args.namelen = 2;
+	args.hashval = xfs_dir_hash_dotdot;
+	args.inumber = inumber;
+	retval = xfs_dir_leaf_addname(&args);
+	if (retval)
+		goto out;
+
+	sfe = &sf->list[0];
+	for (i = 0; i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+		args.name = (char *)(sfe->name);
+		args.namelen = sfe->namelen;
+		args.hashval = xfs_da_hashname((char *)(sfe->name),
+					       sfe->namelen);
+		XFS_DIR_SF_GET_DIRINO_ARCH(&sfe->inumber, &args.inumber, ARCH_CONVERT);
+		retval = xfs_dir_leaf_addname(&args);
+		if (retval)
+			goto out;
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	retval = 0;
+
+out:
+	kmem_free(tmpbuffer, size);
+	return(retval);
+}
+
+STATIC int
+xfs_dir_shortform_compare(const void *a, const void *b)
+{
+	xfs_dir_sf_sort_t *sa, *sb;
+
+	sa = (xfs_dir_sf_sort_t *)a;
+	sb = (xfs_dir_sf_sort_t *)b;
+	if (sa->hash < sb->hash)
+		return -1;
+	else if (sa->hash > sb->hash)
+		return 1;
+	else
+		return sa->entno - sb->entno;
+}
+
+/*
+ * Copy out directory entries for getdents(), for shortform directories.
+ */
+/*ARGSUSED*/
+int
+xfs_dir_shortform_getdents(xfs_inode_t *dp, uio_t *uio, int *eofp,
+				       xfs_dirent_t *dbp, xfs_dir_put_t put)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	int retval, i, sbsize, nsbuf, lastresid=0, want_entno;
+	xfs_mount_t *mp;
+	xfs_dahash_t cookhash, hash;
+	xfs_dir_put_args_t p;
+	xfs_dir_sf_sort_t *sbuf, *sbp;
+
+	mp = dp->i_mount;
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
+	want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
+	nsbuf = INT_GET(sf->hdr.count, ARCH_CONVERT) + 2;
+	sbsize = (nsbuf + 1) * sizeof(*sbuf);
+	sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP);
+
+	xfs_dir_trace_g_du("sf: start", dp, uio);
+
+	/*
+	 * Collect all the entries into the buffer.
+	 * Entry 0 is .
+	 */
+	sbp->entno = 0;
+	sbp->seqno = 0;
+	sbp->hash = xfs_dir_hash_dot;
+	sbp->ino = dp->i_ino;
+	sbp->name = ".";
+	sbp->namelen = 1;
+	sbp++;
+
+	/*
+	 * Entry 1 is ..
+	 */
+	sbp->entno = 1;
+	sbp->seqno = 0;
+	sbp->hash = xfs_dir_hash_dotdot;
+	sbp->ino = XFS_GET_DIR_INO_ARCH(mp, sf->hdr.parent, ARCH_CONVERT);
+	sbp->name = "..";
+	sbp->namelen = 2;
+	sbp++;
+
+	/*
+	 * Scan the directory data for the rest of the entries.
+	 */
+	for (i = 0, sfe = &sf->list[0];
+			i < INT_GET(sf->hdr.count, ARCH_CONVERT); i++) {
+
+		if (((char *)sfe < (char *)sf) ||
+		    ((char *)sfe >= ((char *)sf + dp->i_df.if_bytes)) ||
+		    (sfe->namelen >= MAXNAMELEN)) {
+			xfs_dir_trace_g_du("sf: corrupted", dp, uio);
+			kmem_free(sbuf, sbsize);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+
+		sbp->entno = i + 2;
+		sbp->seqno = 0;
+		sbp->hash = xfs_da_hashname((char *)sfe->name, sfe->namelen);
+		sbp->ino = XFS_GET_DIR_INO_ARCH(mp, sfe->inumber, ARCH_CONVERT);
+		sbp->name = (char *)sfe->name;
+		sbp->namelen = sfe->namelen;
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+		sbp++;
+	}
+
+	/*
+	 * Sort the entries on hash then entno.
+	 */
+	qsort(sbuf, nsbuf, sizeof(*sbuf), xfs_dir_shortform_compare);
+	/*
+	 * Stuff in last entry.
+	 */
+	sbp->entno = nsbuf;
+	sbp->hash = XFS_DA_MAXHASH;
+	sbp->seqno = 0;
+	/*
+	 * Figure out the sequence numbers in case there's a hash duplicate.
+	 */
+	for (hash = sbuf->hash, sbp = sbuf + 1;
+				sbp < &sbuf[nsbuf + 1]; sbp++) {
+		if (sbp->hash == hash)
+			sbp->seqno = sbp[-1].seqno + 1;
+		else
+			hash = sbp->hash;
+	}
+
+	/*
+	 * Set up put routine.
+	 */
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+
+	/*
+	 * Find our place.
+	 */
+	for (sbp = sbuf; sbp < &sbuf[nsbuf + 1]; sbp++) {
+		if (sbp->hash > cookhash ||
+		    (sbp->hash == cookhash && sbp->seqno >= want_entno))
+			break;
+	}
+
+	/*
+	 * Did we fail to find anything?  We stop at the last entry,
+	 * the one we put maxhash into.
+	 */
+	if (sbp == &sbuf[nsbuf]) {
+		kmem_free(sbuf, sbsize);
+		xfs_dir_trace_g_du("sf: hash beyond end", dp, uio);
+		uio->uio_offset = XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
+		*eofp = 1;
+		return 0;
+	}
+
+	/*
+	 * Loop putting entries into the user buffer.
+	 */
+	while (sbp < &sbuf[nsbuf]) {
+		/*
+		 * Save the first resid in a run of equal-hashval entries
+		 * so that we can back them out if they don't all fit.
+		 */
+		if (sbp->seqno == 0 || sbp == sbuf)
+			lastresid = uio->uio_resid;
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		XFS_PUT_COOKIE(p.cook, mp, 0, sbp->seqno, sbp->hash);
+
+#if XFS_BIG_FILESYSTEMS
+		p.ino = sbp->ino + mp->m_inoadd;
+#else
+		p.ino = sbp->ino;
+#endif
+		p.name = sbp->name;
+		p.namelen = sbp->namelen;
+
+		retval = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset =
+				XFS_DA_MAKE_COOKIE(mp, 0, 0, sbp->hash);
+			kmem_free(sbuf, sbsize);
+			uio->uio_resid = lastresid;
+			xfs_dir_trace_g_du("sf: E-O-B", dp, uio);
+			return retval;
+		}
+
+		sbp++;
+	}
+
+	kmem_free(sbuf, sbsize);
+
+	XFS_PUT_COOKIE(p.cook, mp, 0, 0, XFS_DA_MAXHASH);
+
+	uio->uio_offset = p.cook.o;
+
+	*eofp = 1;
+
+	xfs_dir_trace_g_du("sf: E-O-F", dp, uio);
+
+	return 0;
+}
+
+/*
+ * Look up a name in a shortform directory structure, replace the inode number.
+ */
+int
+xfs_dir_shortform_replace(xfs_da_args_t *args)
+{
+	xfs_dir_shortform_t *sf;
+	xfs_dir_sf_entry_t *sfe;
+	xfs_inode_t *dp;
+	int i;
+
+	dp = args->dp;
+	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
+	/*
+	 * Catch the case where the conversion from shortform to leaf
+	 * failed part way through.
+	 */
+	if (dp->i_d.di_size < sizeof(xfs_dir_sf_hdr_t)) {
+		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
+	ASSERT(dp->i_df.if_u1.if_data != NULL);
+	sf = (xfs_dir_shortform_t *)dp->i_df.if_u1.if_data;
+	if (args->namelen == 2 &&
+	    args->name[0] == '.' && args->name[1] == '.') {
+		/* XXX - replace assert? */
+		XFS_DIR_SF_PUT_DIRINO_ARCH(&args->inumber, &sf->hdr.parent, ARCH_CONVERT);
+		xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+		return(0);
+	}
+	ASSERT(args->namelen != 1 || args->name[0] != '.');
+	sfe = &sf->list[0];
+	for (i = INT_GET(sf->hdr.count, ARCH_CONVERT)-1; i >= 0; i--) {
+		if (sfe->namelen == args->namelen &&
+		    sfe->name[0] == args->name[0] &&
+		    bcmp(args->name, sfe->name, args->namelen) == 0) {
+			ASSERT(bcmp((char *)&args->inumber,
+				(char *)&sfe->inumber, sizeof(xfs_ino_t)));
+			XFS_DIR_SF_PUT_DIRINO_ARCH(&args->inumber, &sfe->inumber, ARCH_CONVERT);
+			xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
+			return(0);
+		}
+		sfe = XFS_DIR_SF_NEXTENTRY(sfe);
+	}
+	ASSERT(args->oknoent);
+	return(XFS_ERROR(ENOENT));
+}
+
+/*
+ * Convert a leaf directory to shortform structure
+ */
+int
+xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	xfs_da_args_t args;
+	xfs_inode_t *dp;
+	xfs_ino_t parent;
+	char *tmpbuffer;
+	int retval, i;
+	xfs_dabuf_t *bp;
+
+	dp = iargs->dp;
+	tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+
+	retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp,
+					       XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	bcopy(bp->data, tmpbuffer, XFS_LBSIZE(dp->i_mount));
+	leaf = (xfs_dir_leafblock_t *)tmpbuffer;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	bzero(bp->data, XFS_LBSIZE(dp->i_mount));
+
+	/*
+	 * Find and special case the parent inode number
+	 */
+	hdr = &leaf->hdr;
+	entry = &leaf->entries[0];
+	for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		if ((entry->namelen == 2) &&
+		    (namest->name[0] == '.') &&
+		    (namest->name[1] == '.')) {
+			XFS_DIR_SF_GET_DIRINO_ARCH(&namest->inumber, &parent, ARCH_CONVERT);
+			INT_ZERO(entry->nameidx, ARCH_CONVERT);
+		} else if ((entry->namelen == 1) && (namest->name[0] == '.')) {
+			INT_ZERO(entry->nameidx, ARCH_CONVERT);
+		}
+	}
+	retval = xfs_da_shrink_inode(iargs, 0, bp);
+	if (retval)
+		goto out;
+	retval = xfs_dir_shortform_create(iargs, parent);
+	if (retval)
+		goto out;
+
+	/*
+	 * Copy the rest of the filenames
+	 */
+	entry = &leaf->entries[0];
+	args.dp = dp;
+	args.firstblock = iargs->firstblock;
+	args.flist = iargs->flist;
+	args.total = iargs->total;
+	args.whichfork = XFS_DATA_FORK;
+	args.trans = iargs->trans;
+	args.justcheck = 0;
+	args.addname = args.oknoent = 1;
+	for (i = 0; i < INT_GET(hdr->count, ARCH_CONVERT); entry++, i++) {
+		if (INT_ISZERO(entry->nameidx, ARCH_CONVERT))
+			continue;
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		args.name = (char *)(namest->name);
+		args.namelen = entry->namelen;
+		args.hashval = INT_GET(entry->hashval, ARCH_CONVERT);
+		XFS_DIR_SF_GET_DIRINO_ARCH(&namest->inumber, &args.inumber, ARCH_CONVERT);
+		xfs_dir_shortform_addname(&args);
+	}
+
+out:
+	kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
+	return(retval);
+}
+
+/*
+ * Convert from using a single leaf to a root node and a leaf.
+ */
+int
+xfs_dir_leaf_to_node(xfs_da_args_t *args)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_da_intnode_t *node;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp1, *bp2;
+	xfs_dablk_t blkno;
+	int retval;
+
+	dp = args->dp;
+	retval = xfs_da_grow_inode(args, &blkno);
+	ASSERT(blkno == 1);
+	if (retval)
+		return(retval);
+	retval = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
+					      XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp1 != NULL);
+	retval = xfs_da_get_buf(args->trans, args->dp, 1, -1, &bp2,
+					     XFS_DATA_FORK);
+	if (retval) {
+		xfs_da_buf_done(bp1);
+		return(retval);
+	}
+	ASSERT(bp2 != NULL);
+	bcopy(bp1->data, bp2->data, XFS_LBSIZE(dp->i_mount));
+	xfs_da_buf_done(bp1);
+	xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	/*
+	 * Set up the new root node.
+	 */
+	retval = xfs_da_node_create(args, 0, 1, &bp1, XFS_DATA_FORK);
+	if (retval) {
+		xfs_da_buf_done(bp2);
+		return(retval);
+	}
+	node = bp1->data;
+	leaf = bp2->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	INT_SET(node->btree[0].hashval, ARCH_CONVERT, INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
+	xfs_da_buf_done(bp2);
+	INT_SET(node->btree[0].before, ARCH_CONVERT, blkno);
+	INT_SET(node->hdr.count, ARCH_CONVERT, 1);
+	xfs_da_log_buf(args->trans, bp1,
+		XFS_DA_LOGRANGE(node, &node->btree[0], sizeof(node->btree[0])));
+	xfs_da_buf_done(bp1);
+
+	return(retval);
+}
+
+
+/*========================================================================
+ * Routines used for growing the Btree.
+ *========================================================================*/
+
+/*
+ * Create the initial contents of a leaf directory
+ * or a leaf in a node directory.
+ */
+int
+xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_inode_t *dp;
+	xfs_dabuf_t *bp;
+	int retval;
+
+	dp = args->dp;
+	ASSERT(dp != NULL);
+	retval = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp, XFS_DATA_FORK);
+	if (retval)
+		return(retval);
+	ASSERT(bp != NULL);
+	leaf = bp->data;
+	bzero((char *)leaf, XFS_LBSIZE(dp->i_mount));
+	hdr = &leaf->hdr;
+	INT_SET(hdr->info.magic, ARCH_CONVERT, XFS_DIR_LEAF_MAGIC);
+	INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount));
+	if (INT_ISZERO(hdr->firstused, ARCH_CONVERT))
+		INT_SET(hdr->firstused, ARCH_CONVERT, XFS_LBSIZE(dp->i_mount) - 1);
+	INT_SET(hdr->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
+	INT_SET(hdr->freemap[0].size, ARCH_CONVERT, INT_GET(hdr->firstused, ARCH_CONVERT) - INT_GET(hdr->freemap[0].base, ARCH_CONVERT));
+
+	xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1);
+
+	*bpp = bp;
+	return(0);
+}
+
+/*
+ * Split the leaf node, rebalance, then add the new entry.
+ */
+int
+xfs_dir_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk,
+				  xfs_da_state_blk_t *newblk)
+{
+	xfs_dablk_t blkno;
+	xfs_da_args_t *args;
+	int error;
+
+	/*
+	 * Allocate space for a new leaf node.
+	 */
+	args = state->args;
+	ASSERT(args != NULL);
+	ASSERT(oldblk->magic == XFS_DIR_LEAF_MAGIC);
+	error = xfs_da_grow_inode(args, &blkno);
+	if (error)
+		return(error);
+	error = xfs_dir_leaf_create(args, blkno, &newblk->bp);
+	if (error)
+		return(error);
+	newblk->blkno = blkno;
+	newblk->magic = XFS_DIR_LEAF_MAGIC;
+
+	/*
+	 * Rebalance the entries across the two leaves.
+	 */
+	xfs_dir_leaf_rebalance(state, oldblk, newblk);
+	error = xfs_da_blk_link(state, oldblk, newblk);
+	if (error)
+		return(error);
+
+	/*
+	 * Insert the new entry in the correct block.
+	 */
+	if (state->inleaf) {
+		error = xfs_dir_leaf_add(oldblk->bp, args, oldblk->index);
+	} else {
+		error = xfs_dir_leaf_add(newblk->bp, args, newblk->index);
+	}
+
+	/*
+	 * Update last hashval in each block since we added the name.
+	 */
+	oldblk->hashval = xfs_dir_leaf_lasthash(oldblk->bp, NULL);
+	newblk->hashval = xfs_dir_leaf_lasthash(newblk->bp, NULL);
+	return(error);
+}
+
+/*
+ * Add a name to the leaf directory structure.
+ *
+ * Must take into account fragmented leaves and leaves where spacemap has
+ * lost some freespace information (ie: holes).
+ */
+int
+xfs_dir_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_dir_leaf_map_t *map;
+	int tablesize, entsize, sum, i, tmp, error;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT((index >= 0) && (index <= INT_GET(leaf->hdr.count, ARCH_CONVERT)));
+	hdr = &leaf->hdr;
+	entsize = XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen);
+
+	/*
+	 * Search through freemap for first-fit on new name length.
+	 * (may need to figure in size of entry struct too)
+	 */
+	tablesize = (INT_GET(hdr->count, ARCH_CONVERT) + 1) * (uint)sizeof(xfs_dir_leaf_entry_t)
+			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
+	map = &hdr->freemap[XFS_DIR_LEAF_MAPSIZE-1];
+	for (sum = 0, i = XFS_DIR_LEAF_MAPSIZE-1; i >= 0; map--, i--) {
+		if (tablesize > INT_GET(hdr->firstused, ARCH_CONVERT)) {
+			sum += INT_GET(map->size, ARCH_CONVERT);
+			continue;
+		}
+		if (INT_ISZERO(map->size, ARCH_CONVERT))
+			continue;	/* no space in this map */
+		tmp = entsize;
+		if (INT_GET(map->base, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
+			tmp += (uint)sizeof(xfs_dir_leaf_entry_t);
+		if (INT_GET(map->size, ARCH_CONVERT) >= tmp) {
+			if (!args->justcheck)
+				xfs_dir_leaf_add_work(bp, args, index, i);
+			return(0);
+		}
+		sum += INT_GET(map->size, ARCH_CONVERT);
+	}
+
+	/*
+	 * If there are no holes in the address space of the block,
+	 * and we don't have enough freespace, then compaction will do us
+	 * no good and we should just give up.
+	 */
+	if (!hdr->holes && (sum < entsize))
+		return(XFS_ERROR(ENOSPC));
+
+	/*
+	 * Compact the entries to coalesce free space.
+	 * Pass the justcheck flag so the checking pass can return
+	 * an error, without changing anything, if it won't fit.
+	 */
+	error = xfs_dir_leaf_compact(args->trans, bp,
+			args->total == 0 ?
+				entsize +
+				(uint)sizeof(xfs_dir_leaf_entry_t) : 0,
+			args->justcheck);
+	if (error)
+		return(error);
+	/*
+	 * After compaction, the block is guaranteed to have only one
+	 * free region, in freemap[0].	If it is not big enough, give up.
+	 */
+	if (INT_GET(hdr->freemap[0].size, ARCH_CONVERT) <
+	    (entsize + (uint)sizeof(xfs_dir_leaf_entry_t)))
+		return(XFS_ERROR(ENOSPC));
+
+	if (!args->justcheck)
+		xfs_dir_leaf_add_work(bp, args, index, 0);
+	return(0);
+}
+
+/*
+ * Add a name to a leaf directory structure.
+ */
+STATIC void
+xfs_dir_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int index,
+		      int mapindex)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	xfs_dir_leaf_map_t *map;
+	/* REFERENCED */
+	xfs_mount_t *mp;
+	int tmp, i;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	ASSERT((mapindex >= 0) && (mapindex < XFS_DIR_LEAF_MAPSIZE));
+	ASSERT((index >= 0) && (index <= INT_GET(hdr->count, ARCH_CONVERT)));
+
+	/*
+	 * Force open some space in the entry array and fill it in.
+	 */
+	entry = &leaf->entries[index];
+	if (index < INT_GET(hdr->count, ARCH_CONVERT)) {
+		tmp  = INT_GET(hdr->count, ARCH_CONVERT) - index;
+		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
+		ovbcopy(entry, entry + 1, tmp);
+		xfs_da_log_buf(args->trans, bp,
+		    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
+	}
+	INT_MOD(hdr->count, ARCH_CONVERT, +1);
+
+	/*
+	 * Allocate space for the new string (at the end of the run).
+	 */
+	map = &hdr->freemap[mapindex];
+	mp = args->trans->t_mountp;
+	ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	ASSERT(INT_GET(map->size, ARCH_CONVERT) >= XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen));
+	ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
+	INT_MOD(map->size, ARCH_CONVERT, -(XFS_DIR_LEAF_ENTSIZE_BYNAME(args->namelen)));
+	INT_SET(entry->nameidx, ARCH_CONVERT, INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT));
+	INT_SET(entry->hashval, ARCH_CONVERT, args->hashval);
+	entry->namelen = args->namelen;
+	xfs_da_log_buf(args->trans, bp,
+	    XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
+
+	/*
+	 * Copy the string and inode number into the new space.
+	 */
+	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+	XFS_DIR_SF_PUT_DIRINO_ARCH(&args->inumber, &namest->inumber, ARCH_CONVERT);
+	bcopy(args->name, namest->name, args->namelen);
+	xfs_da_log_buf(args->trans, bp,
+	    XFS_DA_LOGRANGE(leaf, namest, XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)));
+
+	/*
+	 * Update the control info for this leaf node
+	 */
+	if (INT_GET(entry->nameidx, ARCH_CONVERT) < INT_GET(hdr->firstused, ARCH_CONVERT))
+		INT_COPY(hdr->firstused, entry->nameidx, ARCH_CONVERT);
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
+	tmp = (INT_GET(hdr->count, ARCH_CONVERT)-1) * (uint)sizeof(xfs_dir_leaf_entry_t)
+			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
+	map = &hdr->freemap[0];
+	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
+		if (INT_GET(map->base, ARCH_CONVERT) == tmp) {
+			INT_MOD(map->base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
+			INT_MOD(map->size, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
+		}
+	}
+	INT_MOD(hdr->namebytes, ARCH_CONVERT, args->namelen);
+	xfs_da_log_buf(args->trans, bp,
+		XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+}
+
+/*
+ * Garbage collect a leaf directory block by copying it to a new buffer.
+ */
+STATIC int
+xfs_dir_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp, int musthave,
+		     int justcheck)
+{
+	xfs_dir_leafblock_t *leaf_s, *leaf_d;
+	xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_mount_t *mp;
+	char *tmpbuffer;
+	char *tmpbuffer2=NULL;
+	int rval;
+	int lbsize;
+
+	mp = trans->t_mountp;
+	lbsize = XFS_LBSIZE(mp);
+	tmpbuffer = kmem_alloc(lbsize, KM_SLEEP);
+	ASSERT(tmpbuffer != NULL);
+	bcopy(bp->data, tmpbuffer, lbsize);
+
+	/*
+	 * Make a second copy in case xfs_dir_leaf_moveents()
+	 * below destroys the original.
+	 */
+	if (musthave || justcheck) {
+		tmpbuffer2 = kmem_alloc(lbsize, KM_SLEEP);
+		bcopy(bp->data, tmpbuffer2, lbsize);
+	}
+	bzero(bp->data, lbsize);
+
+	/*
+	 * Copy basic information
+	 */
+	leaf_s = (xfs_dir_leafblock_t *)tmpbuffer;
+	leaf_d = bp->data;
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	hdr_d->info = hdr_s->info;	/* struct copy */
+	INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize);
+	if (INT_ISZERO(hdr_d->firstused, ARCH_CONVERT))
+		INT_SET(hdr_d->firstused, ARCH_CONVERT, lbsize - 1);
+	INT_ZERO(hdr_d->namebytes, ARCH_CONVERT);
+	INT_ZERO(hdr_d->count, ARCH_CONVERT);
+	hdr_d->holes = 0;
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, sizeof(xfs_dir_leaf_hdr_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+
+	/*
+	 * Copy all entry's in the same (sorted) order,
+	 * but allocate filenames packed and in sequence.
+	 * This changes the source (leaf_s) as well.
+	 */
+	xfs_dir_leaf_moveents(leaf_s, 0, leaf_d, 0, (int)INT_GET(hdr_s->count, ARCH_CONVERT), mp);
+
+	if (musthave && INT_GET(hdr_d->freemap[0].size, ARCH_CONVERT) < musthave)
+		rval = XFS_ERROR(ENOSPC);
+	else
+		rval = 0;
+
+	if (justcheck || rval == ENOSPC) {
+		ASSERT(tmpbuffer2);
+		bcopy(tmpbuffer2, bp->data, lbsize);
+	} else {
+		xfs_da_log_buf(trans, bp, 0, lbsize - 1);
+	}
+
+	kmem_free(tmpbuffer, lbsize);
+	if (musthave || justcheck)
+		kmem_free(tmpbuffer2, lbsize);
+	return(rval);
+}
+
+/*
+ * Redistribute the directory entries between two leaf nodes,
+ * taking into account the size of the new entry.
+ *
+ * NOTE: if new block is empty, then it will get the upper half of old block.
+ */
+STATIC void
+xfs_dir_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
+				      xfs_da_state_blk_t *blk2)
+{
+	xfs_da_state_blk_t *tmp_blk;
+	xfs_dir_leafblock_t *leaf1, *leaf2;
+	xfs_dir_leaf_hdr_t *hdr1, *hdr2;
+	int count, totallen, max, space, swap;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(blk1->magic == XFS_DIR_LEAF_MAGIC);
+	ASSERT(blk2->magic == XFS_DIR_LEAF_MAGIC);
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	ASSERT(INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+
+	/*
+	 * Check ordering of blocks, reverse if it makes things simpler.
+	 */
+	swap = 0;
+	if (xfs_dir_leaf_order(blk1->bp, blk2->bp)) {
+		tmp_blk = blk1;
+		blk1 = blk2;
+		blk2 = tmp_blk;
+		leaf1 = blk1->bp->data;
+		leaf2 = blk2->bp->data;
+		swap = 1;
+	}
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.  Then get
+	 * the direction to copy and the number of elements to move.
+	 */
+	state->inleaf = xfs_dir_leaf_figure_balance(state, blk1, blk2,
+							   &count, &totallen);
+	if (swap)
+		state->inleaf = !state->inleaf;
+
+	/*
+	 * Move any entries required from leaf to leaf:
+	 */
+	if (count < INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		count = INT_GET(hdr1->count, ARCH_CONVERT) - count;	/* number entries being moved */
+		space  = INT_GET(hdr1->namebytes, ARCH_CONVERT) - totallen;
+		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
+		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
+
+		/*
+		 * leaf2 is the destination, compact it if it looks tight.
+		 */
+		max  = INT_GET(hdr2->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
+		max -= INT_GET(hdr2->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
+		if (space > max) {
+			xfs_dir_leaf_compact(state->args->trans, blk2->bp,
+								 0, 0);
+		}
+
+		/*
+		 * Move high entries from leaf1 to low end of leaf2.
+		 */
+		xfs_dir_leaf_moveents(leaf1, INT_GET(hdr1->count, ARCH_CONVERT) - count,
+					     leaf2, 0, count, state->mp);
+
+		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
+						   state->blocksize-1);
+		xfs_da_log_buf(state->args->trans, blk2->bp, 0,
+						   state->blocksize-1);
+
+	} else if (count > INT_GET(hdr1->count, ARCH_CONVERT)) {
+		/*
+		 * Figure the total bytes to be added to the destination leaf.
+		 */
+		count -= INT_GET(hdr1->count, ARCH_CONVERT);		/* number entries being moved */
+		space  = totallen - INT_GET(hdr1->namebytes, ARCH_CONVERT);
+		space += count * ((uint)sizeof(xfs_dir_leaf_name_t)-1);
+		space += count * (uint)sizeof(xfs_dir_leaf_entry_t);
+
+		/*
+		 * leaf1 is the destination, compact it if it looks tight.
+		 */
+		max  = INT_GET(hdr1->firstused, ARCH_CONVERT) - (uint)sizeof(xfs_dir_leaf_hdr_t);
+		max -= INT_GET(hdr1->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
+		if (space > max) {
+			xfs_dir_leaf_compact(state->args->trans, blk1->bp,
+								 0, 0);
+		}
+
+		/*
+		 * Move low entries from leaf2 to high end of leaf1.
+		 */
+		xfs_dir_leaf_moveents(leaf2, 0, leaf1, (int)INT_GET(hdr1->count, ARCH_CONVERT),
+					     count, state->mp);
+
+		xfs_da_log_buf(state->args->trans, blk1->bp, 0,
+						   state->blocksize-1);
+		xfs_da_log_buf(state->args->trans, blk2->bp, 0,
+						   state->blocksize-1);
+	}
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	blk1->hashval = INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+	blk2->hashval = INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+
+	/*
+	 * Adjust the expected index for insertion.
+	 * GROT: this doesn't work unless blk2 was originally empty.
+	 */
+	if (!state->inleaf) {
+		blk2->index = blk1->index - INT_GET(leaf1->hdr.count, ARCH_CONVERT);
+	}
+}
+
+/*
+ * Examine entries until we reduce the absolute difference in
+ * byte usage between the two blocks to a minimum.
+ * GROT: Is this really necessary?  With other than a 512 byte blocksize,
+ * GROT: there will always be enough room in either block for a new entry.
+ * GROT: Do a double-split for this case?
+ */
+STATIC int
+xfs_dir_leaf_figure_balance(xfs_da_state_t *state,
+					   xfs_da_state_blk_t *blk1,
+					   xfs_da_state_blk_t *blk2,
+					   int *countarg, int *namebytesarg)
+{
+	xfs_dir_leafblock_t *leaf1, *leaf2;
+	xfs_dir_leaf_hdr_t *hdr1, *hdr2;
+	xfs_dir_leaf_entry_t *entry;
+	int count, max, totallen, half;
+	int lastdelta, foundit, tmp;
+
+	/*
+	 * Set up environment.
+	 */
+	leaf1 = blk1->bp->data;
+	leaf2 = blk2->bp->data;
+	hdr1 = &leaf1->hdr;
+	hdr2 = &leaf2->hdr;
+	foundit = 0;
+	totallen = 0;
+
+	/*
+	 * Examine entries until we reduce the absolute difference in
+	 * byte usage between the two blocks to a minimum.
+	 */
+	max = INT_GET(hdr1->count, ARCH_CONVERT) + INT_GET(hdr2->count, ARCH_CONVERT);
+	half  = (max+1) * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
+	half += INT_GET(hdr1->namebytes, ARCH_CONVERT) + INT_GET(hdr2->namebytes, ARCH_CONVERT) + state->args->namelen;
+	half /= 2;
+	lastdelta = state->blocksize;
+	entry = &leaf1->entries[0];
+	for (count = 0; count < max; entry++, count++) {
+
+#define XFS_DIR_ABS(A)	(((A) < 0) ? -(A) : (A))
+		/*
+		 * The new entry is in the first block, account for it.
+		 */
+		if (count == blk1->index) {
+			tmp = totallen + (uint)sizeof(*entry)
+				+ XFS_DIR_LEAF_ENTSIZE_BYNAME(state->args->namelen);
+			if (XFS_DIR_ABS(half - tmp) > lastdelta)
+				break;
+			lastdelta = XFS_DIR_ABS(half - tmp);
+			totallen = tmp;
+			foundit = 1;
+		}
+
+		/*
+		 * Wrap around into the second block if necessary.
+		 */
+		if (count == INT_GET(hdr1->count, ARCH_CONVERT)) {
+			leaf1 = leaf2;
+			entry = &leaf1->entries[0];
+		}
+
+		/*
+		 * Figure out if next leaf entry would be too much.
+		 */
+		tmp = totallen + (uint)sizeof(*entry)
+				+ XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
+		if (XFS_DIR_ABS(half - tmp) > lastdelta)
+			break;
+		lastdelta = XFS_DIR_ABS(half - tmp);
+		totallen = tmp;
+#undef XFS_DIR_ABS
+	}
+
+	/*
+	 * Calculate the number of namebytes that will end up in lower block.
+	 * If new entry not in lower block, fix up the count.
+	 */
+	totallen -=
+		count * (uint)(sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1);
+	if (foundit) {
+		totallen -= (sizeof(*entry)+sizeof(xfs_dir_leaf_entry_t)-1) +
+			    state->args->namelen;
+	}
+
+	*countarg = count;
+	*namebytesarg = totallen;
+	return(foundit);
+}
+
+/*========================================================================
+ * Routines used for shrinking the Btree.
+ *========================================================================*/
+
+/*
+ * Check a leaf block and its neighbors to see if the block should be
+ * collapsed into one or the other neighbor.  Always keep the block
+ * with the smaller block number.
+ * If the current block is over 50% full, don't try to join it, return 0.
+ * If the block is empty, fill in the state structure and return 2.
+ * If it can be collapsed, fill in the state structure and return 1.
+ * If nothing can be done, return 0.
+ */
+int
+xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_da_state_blk_t *blk;
+	xfs_da_blkinfo_t *info;
+	int count, bytes, forward, error, retval, i;
+	xfs_dablk_t blkno;
+	xfs_dabuf_t *bp;
+
+	/*
+	 * Check for the degenerate case of the block being over 50% full.
+	 * If so, it's not worth even looking to see if we might be able
+	 * to coalesce with a sibling.
+	 */
+	blk = &state->path.blk[ state->path.active-1 ];
+	info = blk->bp->data;
+	ASSERT(INT_GET(info->magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	leaf = (xfs_dir_leafblock_t *)info;
+	count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	bytes = (uint)sizeof(xfs_dir_leaf_hdr_t) +
+		count * (uint)sizeof(xfs_dir_leaf_entry_t) +
+		count * ((uint)sizeof(xfs_dir_leaf_name_t)-1) +
+		INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+	if (bytes > (state->blocksize >> 1)) {
+		*action = 0;	/* blk over 50%, dont try to join */
+		return(0);
+	}
+
+	/*
+	 * Check for the degenerate case of the block being empty.
+	 * If the block is empty, we'll simply delete it, no need to
+	 * coalesce it with a sibling block.  We choose (aribtrarily)
+	 * to merge with the forward block unless it is NULL.
+	 */
+	if (count == 0) {
+		/*
+		 * Make altpath point to the block we want to keep and
+		 * path point to the block we want to drop (this one).
+		 */
+		forward = !INT_ISZERO(info->forw, ARCH_CONVERT);
+		bcopy(&state->path, &state->altpath, sizeof(state->path));
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+		if (error)
+			return(error);
+		if (retval) {
+			*action = 0;
+		} else {
+			*action = 2;
+		}
+		return(0);
+	}
+
+	/*
+	 * Examine each sibling block to see if we can coalesce with
+	 * at least 25% free space to spare.  We need to figure out
+	 * whether to merge with the forward or the backward block.
+	 * We prefer coalescing with the lower numbered sibling so as
+	 * to shrink a directory over time.
+	 */
+	forward = (INT_GET(info->forw, ARCH_CONVERT) < INT_GET(info->back, ARCH_CONVERT));	/* start with smaller blk num */
+	for (i = 0; i < 2; forward = !forward, i++) {
+		if (forward)
+			blkno = INT_GET(info->forw, ARCH_CONVERT);
+		else
+			blkno = INT_GET(info->back, ARCH_CONVERT);
+		if (blkno == 0)
+			continue;
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+							    blkno, -1, &bp,
+							    XFS_DATA_FORK);
+		if (error)
+			return(error);
+		ASSERT(bp != NULL);
+
+		leaf = (xfs_dir_leafblock_t *)info;
+		count  = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes  = state->blocksize - (state->blocksize>>2);
+		bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		leaf = bp->data;
+		ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+		count += INT_GET(leaf->hdr.count, ARCH_CONVERT);
+		bytes -= INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+		bytes -= count * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
+		bytes -= count * (uint)sizeof(xfs_dir_leaf_entry_t);
+		bytes -= (uint)sizeof(xfs_dir_leaf_hdr_t);
+		if (bytes >= 0)
+			break;	/* fits with at least 25% to spare */
+
+		xfs_da_brelse(state->args->trans, bp);
+	}
+	if (i >= 2) {
+		*action = 0;
+		return(0);
+	}
+	xfs_da_buf_done(bp);
+
+	/*
+	 * Make altpath point to the block we want to keep (the lower
+	 * numbered block) and path point to the block we want to drop.
+	 */
+	bcopy(&state->path, &state->altpath, sizeof(state->path));
+	if (blkno < blk->blkno) {
+		error = xfs_da_path_shift(state, &state->altpath, forward,
+						 0, &retval);
+	} else {
+		error = xfs_da_path_shift(state, &state->path, forward,
+						 0, &retval);
+	}
+	if (error)
+		return(error);
+	if (retval) {
+		*action = 0;
+	} else {
+		*action = 1;
+	}
+	return(0);
+}
+
+/*
+ * Remove a name from the leaf directory structure.
+ *
+ * Return 1 if leaf is less than 37% full, 0 if >= 37% full.
+ * If two leaves are 37% full, when combined they will leave 25% free.
+ */
+int
+xfs_dir_leaf_remove(xfs_trans_t *trans, xfs_dabuf_t *bp, int index)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_hdr_t *hdr;
+	xfs_dir_leaf_map_t *map;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	int before, after, smallest, entsize;
+	int tablesize, tmp, i;
+	xfs_mount_t *mp;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	hdr = &leaf->hdr;
+	mp = trans->t_mountp;
+	ASSERT((INT_GET(hdr->count, ARCH_CONVERT) > 0) && (INT_GET(hdr->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
+	ASSERT((index >= 0) && (index < INT_GET(hdr->count, ARCH_CONVERT)));
+	ASSERT(INT_GET(hdr->firstused, ARCH_CONVERT) >= ((INT_GET(hdr->count, ARCH_CONVERT)*sizeof(*entry))+sizeof(*hdr)));
+	entry = &leaf->entries[index];
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
+	ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
+
+	/*
+	 * Scan through free region table:
+	 *    check for adjacency of free'd entry with an existing one,
+	 *    find smallest free region in case we need to replace it,
+	 *    adjust any map that borders the entry table,
+	 */
+	tablesize = INT_GET(hdr->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
+			+ (uint)sizeof(xfs_dir_leaf_hdr_t);
+	map = &hdr->freemap[0];
+	tmp = INT_GET(map->size, ARCH_CONVERT);
+	before = after = -1;
+	smallest = XFS_DIR_LEAF_MAPSIZE - 1;
+	entsize = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
+	for (i = 0; i < XFS_DIR_LEAF_MAPSIZE; map++, i++) {
+		ASSERT(INT_GET(map->base, ARCH_CONVERT) < XFS_LBSIZE(mp));
+		ASSERT(INT_GET(map->size, ARCH_CONVERT) < XFS_LBSIZE(mp));
+		if (INT_GET(map->base, ARCH_CONVERT) == tablesize) {
+			INT_MOD(map->base, ARCH_CONVERT, -((uint)sizeof(xfs_dir_leaf_entry_t)));
+			INT_MOD(map->size, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_entry_t));
+		}
+
+		if ((INT_GET(map->base, ARCH_CONVERT) + INT_GET(map->size, ARCH_CONVERT)) == INT_GET(entry->nameidx, ARCH_CONVERT)) {
+			before = i;
+		} else if (INT_GET(map->base, ARCH_CONVERT) == (INT_GET(entry->nameidx, ARCH_CONVERT) + entsize)) {
+			after = i;
+		} else if (INT_GET(map->size, ARCH_CONVERT) < tmp) {
+			tmp = INT_GET(map->size, ARCH_CONVERT);
+			smallest = i;
+		}
+	}
+
+	/*
+	 * Coalesce adjacent freemap regions,
+	 * or replace the smallest region.
+	 */
+	if ((before >= 0) || (after >= 0)) {
+		if ((before >= 0) && (after >= 0)) {
+			map = &hdr->freemap[before];
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+			INT_MOD(map->size, ARCH_CONVERT, INT_GET(hdr->freemap[after].size, ARCH_CONVERT));
+			INT_ZERO(hdr->freemap[after].base, ARCH_CONVERT);
+			INT_ZERO(hdr->freemap[after].size, ARCH_CONVERT);
+		} else if (before >= 0) {
+			map = &hdr->freemap[before];
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+		} else {
+			map = &hdr->freemap[after];
+			INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
+			INT_MOD(map->size, ARCH_CONVERT, entsize);
+		}
+	} else {
+		/*
+		 * Replace smallest region (if it is smaller than free'd entry)
+		 */
+		map = &hdr->freemap[smallest];
+		if (INT_GET(map->size, ARCH_CONVERT) < entsize) {
+			INT_COPY(map->base, entry->nameidx, ARCH_CONVERT);
+			INT_SET(map->size, ARCH_CONVERT, entsize);
+		}
+	}
+
+	/*
+	 * Did we remove the first entry?
+	 */
+	if (INT_GET(entry->nameidx, ARCH_CONVERT) == INT_GET(hdr->firstused, ARCH_CONVERT))
+		smallest = 1;
+	else
+		smallest = 0;
+
+	/*
+	 * Compress the remaining entries and zero out the removed stuff.
+	 */
+	namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+	bzero((char *)namest, entsize);
+	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, namest, entsize));
+
+	INT_MOD(hdr->namebytes, ARCH_CONVERT, -(entry->namelen));
+	tmp = (INT_GET(hdr->count, ARCH_CONVERT) - index) * (uint)sizeof(xfs_dir_leaf_entry_t);
+	ovbcopy(entry + 1, entry, tmp);
+	INT_MOD(hdr->count, ARCH_CONVERT, -1);
+	xfs_da_log_buf(trans, bp,
+	    XFS_DA_LOGRANGE(leaf, entry, tmp + (uint)sizeof(*entry)));
+	entry = &leaf->entries[INT_GET(hdr->count, ARCH_CONVERT)];
+	bzero((char *)entry, sizeof(xfs_dir_leaf_entry_t));
+
+	/*
+	 * If we removed the first entry, re-find the first used byte
+	 * in the name area.  Note that if the entry was the "firstused",
+	 * then we don't have a "hole" in our block resulting from
+	 * removing the name.
+	 */
+	if (smallest) {
+		tmp = XFS_LBSIZE(mp);
+		entry = &leaf->entries[0];
+		for (i = INT_GET(hdr->count, ARCH_CONVERT)-1; i >= 0; entry++, i--) {
+			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) >= INT_GET(hdr->firstused, ARCH_CONVERT));
+			ASSERT(INT_GET(entry->nameidx, ARCH_CONVERT) < XFS_LBSIZE(mp));
+			if (INT_GET(entry->nameidx, ARCH_CONVERT) < tmp)
+				tmp = INT_GET(entry->nameidx, ARCH_CONVERT);
+		}
+		INT_SET(hdr->firstused, ARCH_CONVERT, tmp);
+		if (INT_ISZERO(hdr->firstused, ARCH_CONVERT))
+			INT_SET(hdr->firstused, ARCH_CONVERT, tmp - 1);
+	} else {
+		hdr->holes = 1;		/* mark as needing compaction */
+	}
+
+	xfs_da_log_buf(trans, bp, XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr)));
+
+	/*
+	 * Check if leaf is less than 50% full, caller may want to
+	 * "join" the leaf with a sibling if so.
+	 */
+	tmp  = (uint)sizeof(xfs_dir_leaf_hdr_t);
+	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t);
+	tmp += INT_GET(leaf->hdr.count, ARCH_CONVERT) * ((uint)sizeof(xfs_dir_leaf_name_t) - 1);
+	tmp += INT_GET(leaf->hdr.namebytes, ARCH_CONVERT);
+	if (tmp < mp->m_dir_magicpct)
+		return(1);			/* leaf is < 37% full */
+	return(0);
+}
+
+/*
+ * Move all the directory entries from drop_leaf into save_leaf.
+ */
+void
+xfs_dir_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
+				      xfs_da_state_blk_t *save_blk)
+{
+	xfs_dir_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf;
+	xfs_dir_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr;
+	xfs_mount_t *mp;
+	char *tmpbuffer;
+
+	/*
+	 * Set up environment.
+	 */
+	mp = state->mp;
+	ASSERT(drop_blk->magic == XFS_DIR_LEAF_MAGIC);
+	ASSERT(save_blk->magic == XFS_DIR_LEAF_MAGIC);
+	drop_leaf = drop_blk->bp->data;
+	save_leaf = save_blk->bp->data;
+	ASSERT(INT_GET(drop_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT(INT_GET(save_leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	drop_hdr = &drop_leaf->hdr;
+	save_hdr = &save_leaf->hdr;
+
+	/*
+	 * Save last hashval from dying block for later Btree fixup.
+	 */
+	drop_blk->hashval = INT_GET(drop_leaf->entries[ drop_leaf->hdr.count-1 ].hashval, ARCH_CONVERT);
+
+	/*
+	 * Check if we need a temp buffer, or can we do it in place.
+	 * Note that we don't check "leaf" for holes because we will
+	 * always be dropping it, toosmall() decided that for us already.
+	 */
+	if (save_hdr->holes == 0) {
+		/*
+		 * dest leaf has no holes, so we add there.  May need
+		 * to make some room in the entry array.
+		 */
+		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
+			xfs_dir_leaf_moveents(drop_leaf, 0, save_leaf, 0,
+						 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+		} else {
+			xfs_dir_leaf_moveents(drop_leaf, 0,
+					      save_leaf, INT_GET(save_hdr->count, ARCH_CONVERT),
+					      (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+		}
+	} else {
+		/*
+		 * Destination has holes, so we make a temporary copy
+		 * of the leaf and add them both to that.
+		 */
+		tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP);
+		ASSERT(tmpbuffer != NULL);
+		bzero(tmpbuffer, state->blocksize);
+		tmp_leaf = (xfs_dir_leafblock_t *)tmpbuffer;
+		tmp_hdr = &tmp_leaf->hdr;
+		tmp_hdr->info = save_hdr->info; /* struct copy */
+		INT_ZERO(tmp_hdr->count, ARCH_CONVERT);
+		INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize);
+		if (INT_ISZERO(tmp_hdr->firstused, ARCH_CONVERT))
+			INT_SET(tmp_hdr->firstused, ARCH_CONVERT, state->blocksize - 1);
+		INT_ZERO(tmp_hdr->namebytes, ARCH_CONVERT);
+		if (xfs_dir_leaf_order(save_blk->bp, drop_blk->bp)) {
+			xfs_dir_leaf_moveents(drop_leaf, 0, tmp_leaf, 0,
+						 (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+			xfs_dir_leaf_moveents(save_leaf, 0,
+					      tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
+					      (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
+		} else {
+			xfs_dir_leaf_moveents(save_leaf, 0, tmp_leaf, 0,
+						 (int)INT_GET(save_hdr->count, ARCH_CONVERT), mp);
+			xfs_dir_leaf_moveents(drop_leaf, 0,
+					      tmp_leaf, INT_GET(tmp_leaf->hdr.count, ARCH_CONVERT),
+					      (int)INT_GET(drop_hdr->count, ARCH_CONVERT), mp);
+		}
+		bcopy(tmp_leaf, save_leaf, state->blocksize);
+		kmem_free(tmpbuffer, state->blocksize);
+	}
+
+	xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
+					   state->blocksize - 1);
+
+	/*
+	 * Copy out last hashval in each block for B-tree code.
+	 */
+	save_blk->hashval = INT_GET(save_leaf->entries[ INT_GET(save_leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT);
+}
+
+/*========================================================================
+ * Routines used for finding things in the Btree.
+ *========================================================================*/
+
+/*
+ * Look up a name in a leaf directory structure.
+ * This is the internal routine, it uses the caller's buffer.
+ *
+ * Note that duplicate keys are allowed, but only check within the
+ * current leaf node.  The Btree code must check in adjacent leaf nodes.
+ *
+ * Return in *index the index into the entry[] array of either the found
+ * entry, or where the entry should have been (insert before that entry).
+ *
+ * Don't change the args->inumber unless we find the filename.
+ */
+int
+xfs_dir_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args, int *index)
+{
+	xfs_dir_leafblock_t *leaf;
+	xfs_dir_leaf_entry_t *entry;
+	xfs_dir_leaf_name_t *namest;
+	int probe, span;
+	xfs_dahash_t hashval;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf->hdr.count, ARCH_CONVERT) < (XFS_LBSIZE(args->dp->i_mount)/8));
+
+	/*
+	 * Binary search.  (note: small blocks will skip this loop)
+	 */
+	hashval = args->hashval;
+	probe = span = INT_GET(leaf->hdr.count, ARCH_CONVERT) / 2;
+	for (entry = &leaf->entries[probe]; span > 4;
+		   entry = &leaf->entries[probe]) {
+		span /= 2;
+		if (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)
+			probe += span;
+		else if (INT_GET(entry->hashval, ARCH_CONVERT) > hashval)
+			probe -= span;
+		else
+			break;
+	}
+	ASSERT((probe >= 0) && \
+	       ((INT_ISZERO(leaf->hdr.count, ARCH_CONVERT)) || (probe < INT_GET(leaf->hdr.count, ARCH_CONVERT))));
+	ASSERT((span <= 4) || (INT_GET(entry->hashval, ARCH_CONVERT) == hashval));
+
+	/*
+	 * Since we may have duplicate hashval's, find the first matching
+	 * hashval in the leaf.
+	 */
+	while ((probe > 0) && (INT_GET(entry->hashval, ARCH_CONVERT) >= hashval)) {
+		entry--;
+		probe--;
+	}
+	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) < hashval)) {
+		entry++;
+		probe++;
+	}
+	if ((probe == INT_GET(leaf->hdr.count, ARCH_CONVERT)) || (INT_GET(entry->hashval, ARCH_CONVERT) != hashval)) {
+		*index = probe;
+		ASSERT(args->oknoent);
+		return(XFS_ERROR(ENOENT));
+	}
+
+	/*
+	 * Duplicate keys may be present, so search all of them for a match.
+	 */
+	while ((probe < INT_GET(leaf->hdr.count, ARCH_CONVERT)) && (INT_GET(entry->hashval, ARCH_CONVERT) == hashval)) {
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf, INT_GET(entry->nameidx, ARCH_CONVERT));
+		if (entry->namelen == args->namelen &&
+		    namest->name[0] == args->name[0] &&
+		    bcmp(args->name, namest->name, args->namelen) == 0) {
+			XFS_DIR_SF_GET_DIRINO_ARCH(&namest->inumber, &args->inumber, ARCH_CONVERT);
+			*index = probe;
+			return(XFS_ERROR(EEXIST));
+		}
+		entry++;
+		probe++;
+	}
+	*index = probe;
+	ASSERT(probe == INT_GET(leaf->hdr.count, ARCH_CONVERT) || args->oknoent);
+	return(XFS_ERROR(ENOENT));
+}
+
+/*========================================================================
+ * Utility routines.
+ *========================================================================*/
+
+/*
+ * Move the indicated entries from one leaf to another.
+ * NOTE: this routine modifies both source and destination leaves.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_dir_leaf_moveents(xfs_dir_leafblock_t *leaf_s, int start_s,
+		      xfs_dir_leafblock_t *leaf_d, int start_d,
+		      int count, xfs_mount_t *mp)
+{
+	xfs_dir_leaf_hdr_t *hdr_s, *hdr_d;
+	xfs_dir_leaf_entry_t *entry_s, *entry_d;
+	int tmp, i;
+
+	/*
+	 * Check for nothing to do.
+	 */
+	if (count == 0)
+		return;
+
+	/*
+	 * Set up environment.
+	 */
+	ASSERT(INT_GET(leaf_s->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	ASSERT(INT_GET(leaf_d->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	hdr_s = &leaf_s->hdr;
+	hdr_d = &leaf_d->hdr;
+	ASSERT((INT_GET(hdr_s->count, ARCH_CONVERT) > 0) && (INT_GET(hdr_s->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8)));
+	ASSERT(INT_GET(hdr_s->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_s->count, ARCH_CONVERT)*sizeof(*entry_s))+sizeof(*hdr_s)));
+	ASSERT(INT_GET(hdr_d->count, ARCH_CONVERT) < (XFS_LBSIZE(mp)/8));
+	ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >=
+		((INT_GET(hdr_d->count, ARCH_CONVERT)*sizeof(*entry_d))+sizeof(*hdr_d)));
+
+	ASSERT(start_s < INT_GET(hdr_s->count, ARCH_CONVERT));
+	ASSERT(start_d <= INT_GET(hdr_d->count, ARCH_CONVERT));
+	ASSERT(count <= INT_GET(hdr_s->count, ARCH_CONVERT));
+
+	/*
+	 * Move the entries in the destination leaf up to make a hole?
+	 */
+	if (start_d < INT_GET(hdr_d->count, ARCH_CONVERT)) {
+		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) - start_d;
+		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
+		entry_s = &leaf_d->entries[start_d];
+		entry_d = &leaf_d->entries[start_d + count];
+		bcopy(entry_s, entry_d, tmp);
+	}
+
+	/*
+	 * Copy all entry's in the same (sorted) order,
+	 * but allocate filenames packed and in sequence.
+	 */
+	entry_s = &leaf_s->entries[start_s];
+	entry_d = &leaf_d->entries[start_d];
+	for (i = 0; i < count; entry_s++, entry_d++, i++) {
+		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) >= INT_GET(hdr_s->firstused, ARCH_CONVERT));
+		ASSERT(entry_s->namelen < MAXNAMELEN);
+		tmp = XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry_s);
+		INT_MOD(hdr_d->firstused, ARCH_CONVERT, -(tmp));
+		entry_d->hashval = entry_s->hashval; /* INT_: direct copy */
+		INT_COPY(entry_d->nameidx, hdr_d->firstused, ARCH_CONVERT);
+		entry_d->namelen = entry_s->namelen;
+		ASSERT(INT_GET(entry_d->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
+		bcopy(XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
+		      XFS_DIR_LEAF_NAMESTRUCT(leaf_d, INT_GET(entry_d->nameidx, ARCH_CONVERT)), tmp);
+		ASSERT(INT_GET(entry_s->nameidx, ARCH_CONVERT) + tmp <= XFS_LBSIZE(mp));
+		bzero((char *)XFS_DIR_LEAF_NAMESTRUCT(leaf_s, INT_GET(entry_s->nameidx, ARCH_CONVERT)),
+		      tmp);
+		INT_MOD(hdr_s->namebytes, ARCH_CONVERT, -(entry_d->namelen));
+		INT_MOD(hdr_d->namebytes, ARCH_CONVERT, entry_d->namelen);
+		INT_MOD(hdr_s->count, ARCH_CONVERT, -1);
+		INT_MOD(hdr_d->count, ARCH_CONVERT, +1);
+		tmp  = INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t)
+				+ (uint)sizeof(xfs_dir_leaf_hdr_t);
+		ASSERT(INT_GET(hdr_d->firstused, ARCH_CONVERT) >= tmp);
+
+	}
+
+	/*
+	 * Zero out the entries we just copied.
+	 */
+	if (start_s == INT_GET(hdr_s->count, ARCH_CONVERT)) {
+		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s];
+		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
+		bzero((char *)entry_s, tmp);
+	} else {
+		/*
+		 * Move the remaining entries down to fill the hole,
+		 * then zero the entries at the top.
+		 */
+		tmp  = INT_GET(hdr_s->count, ARCH_CONVERT) - count;
+		tmp *= (uint)sizeof(xfs_dir_leaf_entry_t);
+		entry_s = &leaf_s->entries[start_s + count];
+		entry_d = &leaf_s->entries[start_s];
+		bcopy(entry_s, entry_d, tmp);
+
+		tmp = count * (uint)sizeof(xfs_dir_leaf_entry_t);
+		entry_s = &leaf_s->entries[INT_GET(hdr_s->count, ARCH_CONVERT)];
+		ASSERT((char *)entry_s + tmp <= (char *)leaf_s + XFS_LBSIZE(mp));
+		bzero((char *)entry_s, tmp);
+	}
+
+	/*
+	 * Fill in the freemap information
+	 */
+	INT_SET(hdr_d->freemap[0].base, ARCH_CONVERT, (uint)sizeof(xfs_dir_leaf_hdr_t));
+	INT_MOD(hdr_d->freemap[0].base, ARCH_CONVERT, INT_GET(hdr_d->count, ARCH_CONVERT) * (uint)sizeof(xfs_dir_leaf_entry_t));
+	INT_SET(hdr_d->freemap[0].size, ARCH_CONVERT, INT_GET(hdr_d->firstused, ARCH_CONVERT) - INT_GET(hdr_d->freemap[0].base, ARCH_CONVERT));
+	INT_SET(hdr_d->freemap[1].base, ARCH_CONVERT, INT_ZERO(hdr_d->freemap[2].base, ARCH_CONVERT));
+	INT_SET(hdr_d->freemap[1].size, ARCH_CONVERT, INT_ZERO(hdr_d->freemap[2].size, ARCH_CONVERT));
+	hdr_s->holes = 1;	/* leaf may not be compact */
+}
+
+/*
+ * Compare two leaf blocks "order".
+ */
+int
+xfs_dir_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp)
+{
+	xfs_dir_leafblock_t *leaf1, *leaf2;
+
+	leaf1 = leaf1_bp->data;
+	leaf2 = leaf2_bp->data;
+	ASSERT((INT_GET(leaf1->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC) &&
+	       (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC));
+	if ((INT_GET(leaf1->hdr.count, ARCH_CONVERT) > 0) && (INT_GET(leaf2->hdr.count, ARCH_CONVERT) > 0) &&
+	    ((INT_GET(leaf2->entries[ 0 ].hashval, ARCH_CONVERT) <
+	      INT_GET(leaf1->entries[ 0 ].hashval, ARCH_CONVERT)) ||
+	     (INT_GET(leaf2->entries[ INT_GET(leaf2->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT) <
+	      INT_GET(leaf1->entries[ INT_GET(leaf1->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT)))) {
+		return(1);
+	}
+	return(0);
+}
+
+/*
+ * Pick up the last hashvalue from a leaf block.
+ */
+xfs_dahash_t
+xfs_dir_leaf_lasthash(xfs_dabuf_t *bp, int *count)
+{
+	xfs_dir_leafblock_t *leaf;
+
+	leaf = bp->data;
+	ASSERT(INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) == XFS_DIR_LEAF_MAGIC);
+	if (count)
+		*count = INT_GET(leaf->hdr.count, ARCH_CONVERT);
+	if (INT_ISZERO(leaf->hdr.count, ARCH_CONVERT))
+		return(0);
+	return(INT_GET(leaf->entries[ INT_GET(leaf->hdr.count, ARCH_CONVERT)-1 ].hashval, ARCH_CONVERT));
+}
+
+/*
+ * Copy out directory entries for getdents(), for leaf directories.
+ */
+int
+xfs_dir_leaf_getdents_int(
+	xfs_dabuf_t	*bp,
+	xfs_inode_t	*dp,
+	xfs_dablk_t	bno,
+	uio_t		*uio,
+	int		*eobp,
+	xfs_dirent_t	*dbp,
+	xfs_dir_put_t	put,
+	xfs_daddr_t		nextda)
+{
+	xfs_dir_leafblock_t	*leaf;
+	xfs_dir_leaf_entry_t	*entry;
+	xfs_dir_leaf_name_t	*namest;
+	int			entno, want_entno, i, nextentno;
+	xfs_mount_t		*mp;
+	xfs_dahash_t		cookhash;
+	xfs_dahash_t		nexthash = 0;
+#if (BITS_PER_LONG == 32)
+	xfs_dahash_t		lasthash = XFS_DA_MAXHASH;
+#endif
+	xfs_dir_put_args_t	p;
+
+	mp = dp->i_mount;
+	leaf = bp->data;
+	if (INT_GET(leaf->hdr.info.magic, ARCH_CONVERT) != XFS_DIR_LEAF_MAGIC) {
+		*eobp = 1;
+		return(XFS_ERROR(ENOENT));	/* XXX wrong code */
+	}
+
+	want_entno = XFS_DA_COOKIE_ENTRY(mp, uio->uio_offset);
+
+	cookhash = XFS_DA_COOKIE_HASH(mp, uio->uio_offset);
+
+	xfs_dir_trace_g_dul("leaf: start", dp, uio, leaf);
+
+	/*
+	 * Re-find our place.
+	 */
+	for (i = entno = 0, entry = &leaf->entries[0];
+		     i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
+			     entry++, i++) {
+
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
+				    INT_GET(entry->nameidx, ARCH_CONVERT));
+
+		if (((char *)namest < (char *)leaf) ||
+		    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)) ||
+		    (entry->namelen >= MAXNAMELEN)) {
+			xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		if (INT_GET(entry->hashval, ARCH_CONVERT) >= cookhash) {
+			if (   entno < want_entno
+			    && INT_GET(entry->hashval, ARCH_CONVERT)
+							== cookhash) {
+				/*
+				 * Trying to get to a particular offset in a
+				 * run of equal-hashval entries.
+				 */
+				entno++;
+			} else if (   want_entno > 0
+				   && entno == want_entno
+				   && INT_GET(entry->hashval, ARCH_CONVERT)
+							== cookhash) {
+				break;
+			} else {
+				entno = 0;
+				break;
+			}
+		}
+	}
+
+	if (i == INT_GET(leaf->hdr.count, ARCH_CONVERT)) {
+		xfs_dir_trace_g_du("leaf: hash not found", dp, uio);
+		if (!INT_GET(leaf->hdr.info.forw, ARCH_CONVERT))
+			uio->uio_offset =
+				XFS_DA_MAKE_COOKIE(mp, 0, 0, XFS_DA_MAXHASH);
+		/*
+		 * Don't set uio_offset if there's another block:
+		 * the node code will be setting uio_offset anyway.
+		 */
+		*eobp = 0;
+		return(0);
+	}
+	xfs_dir_trace_g_due("leaf: hash found", dp, uio, entry);
+
+	p.dbp = dbp;
+	p.put = put;
+	p.uio = uio;
+
+	/*
+	 * We're synchronized, start copying entries out to the user.
+	 */
+	for (; entno >= 0 && i < INT_GET(leaf->hdr.count, ARCH_CONVERT);
+			     entry++, i++, (entno = nextentno)) {
+		int lastresid=0, retval;
+		xfs_dircook_t lastoffset;
+		xfs_dahash_t thishash;
+
+		/*
+		 * Check for a damaged directory leaf block and pick up
+		 * the inode number from this entry.
+		 */
+		namest = XFS_DIR_LEAF_NAMESTRUCT(leaf,
+				    INT_GET(entry->nameidx, ARCH_CONVERT));
+
+		if (((char *)namest < (char *)leaf) ||
+		    ((char *)namest >= (char *)leaf + XFS_LBSIZE(mp)) ||
+		    (entry->namelen >= MAXNAMELEN)) {
+			xfs_dir_trace_g_du("leaf: corrupted", dp, uio);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+
+		thishash = INT_GET(entry->hashval, ARCH_CONVERT);
+
+		/*
+		 * NOTE! Linux "filldir" semantics require that the
+		 *	 offset "cookie" be for this entry, not the
+		 *	 next; all the actual shuffling to make it
+		 *	 "look right" to the user is done in filldir.
+		 */
+		XFS_PUT_COOKIE(p.cook, mp, bno, entno, thishash);
+
+		xfs_dir_trace_g_duc("leaf: middle cookie  ",
+						   dp, uio, p.cook.o);
+
+		if (i < (INT_GET(leaf->hdr.count, ARCH_CONVERT) - 1)) {
+			nexthash = INT_GET(entry[1].hashval, ARCH_CONVERT);
+
+			if (nexthash == INT_GET(entry->hashval, ARCH_CONVERT))
+				nextentno = entno + 1;
+			else
+				nextentno = 0;
+
+		} else if (INT_GET(leaf->hdr.info.forw, ARCH_CONVERT)) {
+			xfs_dabuf_t *bp2;
+			xfs_dir_leafblock_t *leaf2;
+
+			ASSERT(nextda != -1);
+
+			retval = xfs_da_read_buf(dp->i_transp, dp,
+						 INT_GET(leaf->hdr.info.forw,
+						 ARCH_CONVERT), nextda,
+						 &bp2, XFS_DATA_FORK);
+			if (retval)
+				return(retval);
+
+			ASSERT(bp2 != NULL);
+
+			leaf2 = bp2->data;
+
+			if (   (INT_GET(leaf2->hdr.info.magic, ARCH_CONVERT)
+						!= XFS_DIR_LEAF_MAGIC)
+			    || (INT_GET(leaf2->hdr.info.back, ARCH_CONVERT)
+						!= bno)) {	/* GROT */
+
+				xfs_da_brelse(dp->i_transp, bp2);
+
+				return(XFS_ERROR(EFSCORRUPTED));
+			}
+
+			nexthash = INT_GET(leaf2->entries[0].hashval,
+								ARCH_CONVERT);
+			nextentno = -1;
+
+			xfs_da_brelse(dp->i_transp, bp2);
+			xfs_dir_trace_g_duc("leaf: next blk cookie",
+						   dp, uio, p.cook.o);
+		} else {
+			nextentno = -1;
+			nexthash  = XFS_DA_MAXHASH;
+		}
+
+		/*
+		 * Save off the cookie so we can fall back should the
+		 * 'put' into the outgoing buffer fails.  To handle a run
+		 * of equal-hashvals, the off_t structure on 64bit
+		 * builds has entno built into the cookie to ID the
+		 * entry.  On 32bit builds, we only have space for the
+		 * hashval so we can't ID specific entries within a group
+		 * of same hashval entries.   For this, lastoffset is set
+		 * to the first in the run of equal hashvals so we don't
+		 * include any entries unless we can include all entries
+		 * that share the same hashval.	 Hopefully the buffer
+		 * provided is big enough to handle it (see pv763517).
+		 */
+#if (BITS_PER_LONG == 32)
+		if (INT_GET(entry->hashval, ARCH_CONVERT) != lasthash) {
+			XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
+			lastresid = uio->uio_resid;
+			lasthash = thishash;
+		} else {
+			xfs_dir_trace_g_duc("leaf: DUP COOKIES, skipped",
+						   dp, uio, p.cook.o);
+		}
+#else
+		XFS_PUT_COOKIE(lastoffset, mp, bno, entno, thishash);
+		lastresid = uio->uio_resid;
+#endif /* BITS_PER_LONG == 32 */
+
+		/*
+		 * Put the current entry into the outgoing buffer.  If we fail
+		 * then restore the UIO to the first entry in the current
+		 * run of equal-hashval entries (probably one 1 entry long).
+		 */
+#if XFS_BIG_FILESYSTEMS
+		p.ino = XFS_GET_DIR_INO_ARCH(mp, namest->inumber, ARCH_CONVERT) + mp->m_inoadd;
+#else
+		p.ino = XFS_GET_DIR_INO_ARCH(mp, namest->inumber, ARCH_CONVERT);
+#endif
+		p.name = (char *)namest->name;
+		p.namelen = entry->namelen;
+
+		retval = p.put(&p);
+
+		if (!p.done) {
+			uio->uio_offset = lastoffset.o;
+			uio->uio_resid = lastresid;
+
+			*eobp = 1;
+
+			xfs_dir_trace_g_du("leaf: E-O-B", dp, uio);
+
+			return(retval);
+		}
+	}
+
+	XFS_PUT_COOKIE(p.cook, mp, 0, 0, nexthash);
+
+	uio->uio_offset = p.cook.o;
+
+	*eobp = 0;
+
+	xfs_dir_trace_g_du("leaf: E-O-F", dp, uio);
+
+	return(0);
+}
+
+/*
+ * Format a dirent64 structure and copy it out the the user's buffer.
+ */
+int
+xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa)
+{
+	iovec_t *iovp;
+	int reclen, namelen;
+	xfs_dirent_t *idbp;
+	uio_t *uio;
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	iovp = uio->uio_iov;
+	idbp = (xfs_dirent_t *)iovp->iov_base;
+	iovp->iov_base = (char *)idbp + reclen;
+	iovp->iov_len -= reclen;
+	uio->uio_resid -= reclen;
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook.o;
+	idbp->d_name[namelen] = '\0';
+	pa->done = 1;
+	bcopy(pa->name, idbp->d_name, namelen);
+	return 0;
+}
+
+/*
+ * Format a dirent64 structure and copy it out the the user's buffer.
+ */
+int
+xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa)
+{
+	int		retval, reclen, namelen;
+	xfs_dirent_t	*idbp;
+	uio_t		*uio;
+
+	namelen = pa->namelen;
+	reclen = DIRENTSIZE(namelen);
+	uio = pa->uio;
+	if (reclen > uio->uio_resid) {
+		pa->done = 0;
+		return 0;
+	}
+	idbp = pa->dbp;
+	idbp->d_reclen = reclen;
+	idbp->d_ino = pa->ino;
+	idbp->d_off = pa->cook.o;
+	idbp->d_name[namelen] = '\0';
+	bcopy(pa->name, idbp->d_name, namelen);
+	retval = uiomove((caddr_t)idbp, reclen, UIO_READ, uio);
+	pa->done = (retval == 0);
+	return retval;
+}
diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h
new file mode 100644
index 000000000000..d81eebdcc0a5
--- /dev/null
+++ b/fs/xfs/xfs_dir_leaf.h
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR_LEAF_H__
+#define __XFS_DIR_LEAF_H__
+
+/*
+ * Directory layout, internal structure, access macros, etc.
+ *
+ * Large directories are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Filenames are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of a filename may not be unique, we may have duplicate keys.	 The
+ * internal links in the Btree are logical block offsets into the file.
+ */
+
+struct dirent;
+struct uio;
+struct xfs_bmap_free;
+struct xfs_dabuf;
+struct xfs_da_args;
+struct xfs_da_state;
+struct xfs_da_state_blk;
+struct xfs_dir_put_args;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*========================================================================
+ * Directory Structure when equal to XFS_LBSIZE(mp) bytes.
+ *========================================================================*/
+
+/*
+ * This is the structure of the leaf nodes in the Btree.
+ *
+ * Struct leaf_entry's are packed from the top.	 Names grow from the bottom
+ * but are not packed.	The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped.  When the freemap doesn't show enough space
+ * for an allocation, we compact the namelist area and try again.  If we
+ * still don't have enough space, then we have to split the block.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual string.  The root and intermediate node search always takes
+ * the first-in-the-block key match found, so we should only have to work
+ * "forw"ard.  If none matches, continue with the "forw"ard leaf nodes
+ * until the hash key changes or the filename is found.
+ *
+ * The parent directory and the self-pointer are explicitly represented
+ * (ie: there are entries for "." and "..").
+ *
+ * Note that the count being a __uint16_t limits us to something like a
+ * blocksize of 1.3MB in the face of worst case (short) filenames.
+ */
+#define XFS_DIR_LEAF_MAPSIZE	3	/* how many freespace slots */
+
+typedef struct xfs_dir_leafblock {
+	struct xfs_dir_leaf_hdr {	/* constant-structure header block */
+		xfs_da_blkinfo_t info;	/* block type, links, etc. */
+		__uint16_t count;	/* count of active leaf_entry's */
+		__uint16_t namebytes;	/* num bytes of name strings stored */
+		__uint16_t firstused;	/* first used byte in name area */
+		__uint8_t  holes;	/* != 0 if blk needs compaction */
+		__uint8_t  pad1;
+		struct xfs_dir_leaf_map {/* RLE map of free bytes */
+			__uint16_t base; /* base of free region */
+			__uint16_t size; /* run length of free region */
+		} freemap[XFS_DIR_LEAF_MAPSIZE]; /* N largest free regions */
+	} hdr;
+	struct xfs_dir_leaf_entry {	/* sorted on key, not name */
+		xfs_dahash_t hashval;	/* hash value of name */
+		__uint16_t nameidx;	/* index into buffer of name */
+		__uint8_t namelen;	/* length of name string */
+		__uint8_t pad2;
+	} entries[1];			/* var sized array */
+	struct xfs_dir_leaf_name {
+		xfs_dir_ino_t inumber;	/* inode number for this key */
+		__uint8_t name[1];	/* name string itself */
+	} namelist[1];			/* grows from bottom of buf */
+} xfs_dir_leafblock_t;
+typedef struct xfs_dir_leaf_hdr xfs_dir_leaf_hdr_t;
+typedef struct xfs_dir_leaf_map xfs_dir_leaf_map_t;
+typedef struct xfs_dir_leaf_entry xfs_dir_leaf_entry_t;
+typedef struct xfs_dir_leaf_name xfs_dir_leaf_name_t;
+
+/*
+ * Length of name for which a 512-byte block filesystem
+ * can get a double split.
+ */
+#define XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN	\
+	(512 - (uint)sizeof(xfs_dir_leaf_hdr_t) - \
+	 (uint)sizeof(xfs_dir_leaf_entry_t) * 2 - \
+	 (uint)sizeof(xfs_dir_leaf_name_t) * 2 - (MAXNAMELEN - 2) + 1 + 1)
+
+typedef int (*xfs_dir_put_t)(struct xfs_dir_put_args *pa);
+
+typedef union {
+	xfs_off_t		o;		/* offset (cookie) */
+	/*
+	 * Watch the order here (endian-ness dependent).
+	 */
+	struct {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+		xfs_dahash_t	h;	/* hash value */
+		__uint32_t	be;	/* block and entry */
+#else	/* __BYTE_ORDER == __BIG_ENDIAN */
+		__uint32_t	be;	/* block and entry */
+		xfs_dahash_t	h;	/* hash value */
+#endif	/* __BYTE_ORDER == __BIG_ENDIAN */
+	} s;
+} xfs_dircook_t;
+
+#define XFS_PUT_COOKIE(c,mp,bno,entry,hash)	\
+	((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash))
+
+#define XFS_GET_DIR_INO_ARCH(mp,di,arch) \
+    DIRINO_GET_ARCH(&(di),arch)
+#define XFS_GET_DIR_INO(mp,di) \
+    XFS_GET_DIR_INO_ARCH(mp,di,ARCH_NOCONVERT)
+
+typedef struct xfs_dir_put_args
+{
+	xfs_dircook_t	cook;		/* cookie of (next) entry */
+	xfs_intino_t	ino;		/* inode number */
+	struct xfs_dirent	*dbp;		/* buffer pointer */
+	char		*name;		/* directory entry name */
+	int		namelen;	/* length of name */
+	int		done;		/* output: set if value was stored */
+	xfs_dir_put_t	put;		/* put function ptr (i/o) */
+	struct uio	*uio;		/* uio control structure */
+} xfs_dir_put_args_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYNAME)
+int xfs_dir_leaf_entsize_byname(int len);
+#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len)	xfs_dir_leaf_entsize_byname(len)
+#else
+#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len)	/* space a name will use */ \
+	((uint)sizeof(xfs_dir_leaf_name_t)-1 + len)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYENTRY)
+int xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry);
+#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)	\
+	xfs_dir_leaf_entsize_byentry(entry)
+#else
+#define XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry)	/* space an entry will use */ \
+	((uint)sizeof(xfs_dir_leaf_name_t)-1 + (entry)->namelen)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_LEAF_NAMESTRUCT)
+xfs_dir_leaf_name_t *
+xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset);
+#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset)	\
+	xfs_dir_leaf_namestruct(leafp,offset)
+#else
+#define XFS_DIR_LEAF_NAMESTRUCT(leafp,offset)	/* point to name struct */ \
+	((xfs_dir_leaf_name_t *)&((char *)(leafp))[offset])
+#endif
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Internal routines when dirsize < XFS_LITINO(mp).
+ */
+int xfs_dir_shortform_create(struct xfs_da_args *args, xfs_ino_t parent);
+int xfs_dir_shortform_addname(struct xfs_da_args *args);
+int xfs_dir_shortform_lookup(struct xfs_da_args *args);
+int xfs_dir_shortform_to_leaf(struct xfs_da_args *args);
+int xfs_dir_shortform_removename(struct xfs_da_args *args);
+int xfs_dir_shortform_getdents(struct xfs_inode *dp, struct uio *uio, int *eofp,
+				      struct xfs_dirent *dbp, xfs_dir_put_t put);
+int xfs_dir_shortform_replace(struct xfs_da_args *args);
+
+/*
+ * Internal routines when dirsize == XFS_LBSIZE(mp).
+ */
+int xfs_dir_leaf_to_node(struct xfs_da_args *args);
+int xfs_dir_leaf_to_shortform(struct xfs_da_args *args);
+
+/*
+ * Routines used for growing the Btree.
+ */
+int	xfs_dir_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block,
+				   struct xfs_dabuf **bpp);
+int	xfs_dir_leaf_split(struct xfs_da_state *state,
+				  struct xfs_da_state_blk *oldblk,
+				  struct xfs_da_state_blk *newblk);
+int	xfs_dir_leaf_add(struct xfs_dabuf *leaf_buffer,
+				struct xfs_da_args *args, int insertion_index);
+int	xfs_dir_leaf_addname(struct xfs_da_args *args);
+int	xfs_dir_leaf_lookup_int(struct xfs_dabuf *leaf_buffer,
+				       struct xfs_da_args *args,
+				       int *index_found_at);
+int	xfs_dir_leaf_remove(struct xfs_trans *trans,
+				   struct xfs_dabuf *leaf_buffer,
+				   int index_to_remove);
+int	xfs_dir_leaf_getdents_int(struct xfs_dabuf *bp, struct xfs_inode *dp,
+					 xfs_dablk_t bno, struct uio *uio,
+					 int *eobp, struct xfs_dirent *dbp,
+					 xfs_dir_put_t put, xfs_daddr_t nextda);
+
+/*
+ * Routines used for shrinking the Btree.
+ */
+int	xfs_dir_leaf_toosmall(struct xfs_da_state *state, int *retval);
+void	xfs_dir_leaf_unbalance(struct xfs_da_state *state,
+					     struct xfs_da_state_blk *drop_blk,
+					     struct xfs_da_state_blk *save_blk);
+
+/*
+ * Utility routines.
+ */
+uint	xfs_dir_leaf_lasthash(struct xfs_dabuf *bp, int *count);
+int	xfs_dir_leaf_order(struct xfs_dabuf *leaf1_bp,
+				  struct xfs_dabuf *leaf2_bp);
+int	xfs_dir_put_dirent64_direct(xfs_dir_put_args_t *pa);
+int	xfs_dir_put_dirent64_uio(xfs_dir_put_args_t *pa);
+int	xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
+
+
+/*
+ * Global data.
+ */
+extern xfs_dahash_t	xfs_dir_hash_dot, xfs_dir_hash_dotdot;
+
+#endif /* __XFS_DIR_LEAF_H__ */
diff --git a/fs/xfs/xfs_dir_sf.h b/fs/xfs/xfs_dir_sf.h
new file mode 100644
index 000000000000..ede171472223
--- /dev/null
+++ b/fs/xfs/xfs_dir_sf.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DIR_SF_H__
+#define __XFS_DIR_SF_H__
+
+/*
+ * Directory layout when stored internal to an inode.
+ *
+ * Small directories are packed as tightly as possible so as to
+ * fit into the literal area of the inode.
+ */
+
+typedef struct { __uint8_t i[sizeof(xfs_ino_t)]; } xfs_dir_ino_t;
+
+/*
+ * The parent directory has a dedicated field, and the self-pointer must
+ * be calculated on the fly.
+ *
+ * Entries are packed toward the top as tight as possible.  The header
+ * and the elements much be bcopy()'d out into a work area to get correct
+ * alignment for the inode number fields.
+ */
+typedef struct xfs_dir_shortform {
+	struct xfs_dir_sf_hdr {		/* constant-structure header block */
+		xfs_dir_ino_t parent;	/* parent dir inode number */
+		__uint8_t count;	/* count of active entries */
+	} hdr;
+	struct xfs_dir_sf_entry {
+		xfs_dir_ino_t inumber;	/* referenced inode number */
+		__uint8_t namelen;	/* actual length of name (no NULL) */
+		__uint8_t name[1];	/* name */
+	} list[1];			/* variable sized array */
+} xfs_dir_shortform_t;
+typedef struct xfs_dir_sf_hdr xfs_dir_sf_hdr_t;
+typedef struct xfs_dir_sf_entry xfs_dir_sf_entry_t;
+
+/*
+ * We generate this then sort it, so that readdirs are returned in
+ * hash-order.	Else seekdir won't work.
+ */
+typedef struct xfs_dir_sf_sort {
+	__uint8_t	entno;		/* .=0, ..=1, else entry# + 2 */
+	__uint8_t	seqno;		/* sequence # with same hash value */
+	__uint8_t	namelen;	/* length of name value (no null) */
+	xfs_dahash_t	hash;		/* this entry's hash value */
+	xfs_intino_t	ino;		/* this entry's inode number */
+	char		*name;		/* name value, pointer into buffer */
+} xfs_dir_sf_sort_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_GET_DIRINO)
+void xfs_dir_sf_get_dirino_arch(xfs_dir_ino_t *from, xfs_ino_t *to, xfs_arch_t arch);
+void xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to);
+#define XFS_DIR_SF_GET_DIRINO_ARCH(from,to,arch)    xfs_dir_sf_get_dirino_arch(from, to, arch)
+#define XFS_DIR_SF_GET_DIRINO(from,to)		    xfs_dir_sf_get_dirino(from, to)
+#else
+#define XFS_DIR_SF_GET_DIRINO_ARCH(from,to,arch)    DIRINO_COPY_ARCH(from,to,arch)
+#define XFS_DIR_SF_GET_DIRINO(from,to)		    DIRINO_COPY_ARCH(from,to,ARCH_NOCONVERT)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_PUT_DIRINO)
+void xfs_dir_sf_put_dirino_arch(xfs_ino_t *from, xfs_dir_ino_t *to, xfs_arch_t arch);
+void xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to);
+#define XFS_DIR_SF_PUT_DIRINO_ARCH(from,to,arch)    xfs_dir_sf_put_dirino_arch(from, to, arch)
+#define XFS_DIR_SF_PUT_DIRINO(from,to)		    xfs_dir_sf_put_dirino(from, to)
+#else
+#define XFS_DIR_SF_PUT_DIRINO_ARCH(from,to,arch)    DIRINO_COPY_ARCH(from,to,arch)
+#define XFS_DIR_SF_PUT_DIRINO(from,to)		    DIRINO_COPY_ARCH(from,to,ARCH_NOCONVERT)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYNAME)
+int xfs_dir_sf_entsize_byname(int len);
+#define XFS_DIR_SF_ENTSIZE_BYNAME(len)		xfs_dir_sf_entsize_byname(len)
+#else
+#define XFS_DIR_SF_ENTSIZE_BYNAME(len)		/* space a name uses */ \
+	((uint)sizeof(xfs_dir_sf_entry_t)-1 + (len))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ENTSIZE_BYENTRY)
+int xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep);
+#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)	xfs_dir_sf_entsize_byentry(sfep)
+#else
+#define XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)	/* space an entry uses */ \
+	((uint)sizeof(xfs_dir_sf_entry_t)-1 + (sfep)->namelen)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_NEXTENTRY)
+xfs_dir_sf_entry_t *xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep);
+#define XFS_DIR_SF_NEXTENTRY(sfep)		xfs_dir_sf_nextentry(sfep)
+#else
+#define XFS_DIR_SF_NEXTENTRY(sfep)		/* next entry in struct */ \
+	((xfs_dir_sf_entry_t *) \
+		((char *)(sfep) + XFS_DIR_SF_ENTSIZE_BYENTRY(sfep)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DIR_SF_ALLFIT)
+int xfs_dir_sf_allfit(int count, int totallen);
+#define XFS_DIR_SF_ALLFIT(count,totallen)	\
+	xfs_dir_sf_allfit(count,totallen)
+#else
+#define XFS_DIR_SF_ALLFIT(count,totallen)	/* will all entries fit? */ \
+	((uint)sizeof(xfs_dir_sf_hdr_t) + \
+	       ((uint)sizeof(xfs_dir_sf_entry_t)-1)*(count) + (totallen))
+#endif
+
+#ifdef XFS_ALL_TRACE
+#define XFS_DIR_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_DIR_TRACE
+#endif
+
+/*
+ * Kernel tracing support for directories.
+ */
+struct uio;
+struct xfs_inode;
+struct xfs_da_intnode;
+struct xfs_dinode;
+struct xfs_dir_leafblock;
+struct xfs_dir_leaf_entry;
+
+#define XFS_DIR_TRACE_SIZE	4096	/* size of global trace buffer */
+
+/*
+ * Trace record types.
+ */
+#define XFS_DIR_KTRACE_G_DU	1	/* dp, uio */
+#define XFS_DIR_KTRACE_G_DUB	2	/* dp, uio, bno */
+#define XFS_DIR_KTRACE_G_DUN	3	/* dp, uio, node */
+#define XFS_DIR_KTRACE_G_DUL	4	/* dp, uio, leaf */
+#define XFS_DIR_KTRACE_G_DUE	5	/* dp, uio, leaf entry */
+#define XFS_DIR_KTRACE_G_DUC	6	/* dp, uio, cookie */
+
+#if defined(XFS_DIR_TRACE)
+
+void xfs_dir_trace_g_du(char *where, struct xfs_inode *dp, struct uio *uio);
+void xfs_dir_trace_g_dub(char *where, struct xfs_inode *dp, struct uio *uio,
+			      xfs_dablk_t bno);
+void xfs_dir_trace_g_dun(char *where, struct xfs_inode *dp, struct uio *uio,
+			      struct xfs_da_intnode *node);
+void xfs_dir_trace_g_dul(char *where, struct xfs_inode *dp, struct uio *uio,
+			      struct xfs_dir_leafblock *leaf);
+void xfs_dir_trace_g_due(char *where, struct xfs_inode *dp, struct uio *uio,
+			      struct xfs_dir_leaf_entry *entry);
+void xfs_dir_trace_g_duc(char *where, struct xfs_inode *dp, struct uio *uio,
+			      xfs_off_t cookie);
+void xfs_dir_trace_enter(int type, char *where,
+			     __psunsigned_t a0, __psunsigned_t a1,
+			     __psunsigned_t a2, __psunsigned_t a3,
+			     __psunsigned_t a4, __psunsigned_t a5,
+			     __psunsigned_t a6, __psunsigned_t a7,
+			     __psunsigned_t a8, __psunsigned_t a9,
+			     __psunsigned_t a10, __psunsigned_t a11);
+#else
+#define xfs_dir_trace_g_du(w,d,u)
+#define xfs_dir_trace_g_dub(w,d,u,b)
+#define xfs_dir_trace_g_dun(w,d,u,n)
+#define xfs_dir_trace_g_dul(w,d,u,l)
+#define xfs_dir_trace_g_due(w,d,u,e)
+#define xfs_dir_trace_g_duc(w,d,u,c)
+#endif /* DEBUG */
+
+#endif	/* __XFS_DIR_SF_H__ */
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
new file mode 100644
index 000000000000..dbfff6f70ad9
--- /dev/null
+++ b/fs/xfs/xfs_dmapi.h
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DMAPI_H__
+#define __XFS_DMAPI_H__
+
+#ifdef CONFIG_XFS_DMAPI
+
+#include <dmapi/dmapi.h>
+#include <dmapi/dmapi_kern.h>
+
+/*	Values used to define the on-disk version of dm_attrname_t. All
+ *	on-disk attribute names start with the 8-byte string "SGI_DMI_".
+ *
+ *	In the on-disk inode, DMAPI attribute names consist of the user-provided
+ *	name with the DMATTR_PREFIXSTRING pre-pended.  This string must NEVER be
+ *	changed.
+ */
+
+#define DMATTR_PREFIXLEN	8
+#define DMATTR_PREFIXSTRING	"SGI_DMI_"
+
+/* Defines for determining if an event message should be sent. */
+#define DM_EVENT_ENABLED(vfsp, ip, event) ( \
+	unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
+		( ((ip)->i_d.di_dmevmask & (1 << event)) || \
+		  ((ip)->i_mount->m_dmevmask & (1 << event)) ) \
+	)
+
+#define DM_EVENT_ENABLED_IO(vfsp, io, event) ( \
+	unlikely ((vfsp)->vfs_flag & VFS_DMI) && \
+		( ((io)->io_dmevmask & (1 << event)) || \
+		  ((io)->io_mount->m_dmevmask & (1 << event)) ) \
+	)
+
+/*
+ *	Macros to turn caller specified delay/block flags into
+ *	dm_send_xxxx_event flag DM_FLAGS_NDELAY.
+ */
+
+#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
+			DM_FLAGS_NDELAY : 0)
+#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
+
+
+
+/* events valid in dm_set_eventlist() when called with a filesystem handle.
+   These events are not persistent.
+*/
+
+#define DM_XFS_VALID_FS_EVENTS		( \
+	(1 << DM_EVENT_PREUNMOUNT)	| \
+	(1 << DM_EVENT_UNMOUNT)		| \
+	(1 << DM_EVENT_NOSPACE)		| \
+	(1 << DM_EVENT_DEBUT)		| \
+	(1 << DM_EVENT_CREATE)		| \
+	(1 << DM_EVENT_POSTCREATE)	| \
+	(1 << DM_EVENT_REMOVE)		| \
+	(1 << DM_EVENT_POSTREMOVE)	| \
+	(1 << DM_EVENT_RENAME)		| \
+	(1 << DM_EVENT_POSTRENAME)	| \
+	(1 << DM_EVENT_LINK)		| \
+	(1 << DM_EVENT_POSTLINK)	| \
+	(1 << DM_EVENT_SYMLINK)		| \
+	(1 << DM_EVENT_POSTSYMLINK)	| \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
+
+/* Events valid in dm_set_eventlist() when called with a file handle for
+   a regular file or a symlink.	 These events are persistent.
+*/
+
+#define DM_XFS_VALID_FILE_EVENTS	( \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
+
+/* Events valid in dm_set_eventlist() when called with a file handle for
+   a directory.	 These events are persistent.
+*/
+
+#define DM_XFS_VALID_DIRECTORY_EVENTS	( \
+	(1 << DM_EVENT_CREATE)		| \
+	(1 << DM_EVENT_POSTCREATE)	| \
+	(1 << DM_EVENT_REMOVE)		| \
+	(1 << DM_EVENT_POSTREMOVE)	| \
+	(1 << DM_EVENT_RENAME)		| \
+	(1 << DM_EVENT_POSTRENAME)	| \
+	(1 << DM_EVENT_LINK)		| \
+	(1 << DM_EVENT_POSTLINK)	| \
+	(1 << DM_EVENT_SYMLINK)		| \
+	(1 << DM_EVENT_POSTSYMLINK)	| \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
+
+
+/* Events supported by the XFS filesystem. */
+#define DM_XFS_SUPPORTED_EVENTS		( \
+	(1 << DM_EVENT_MOUNT)		| \
+	(1 << DM_EVENT_PREUNMOUNT)	| \
+	(1 << DM_EVENT_UNMOUNT)		| \
+	(1 << DM_EVENT_NOSPACE)		| \
+	(1 << DM_EVENT_CREATE)		| \
+	(1 << DM_EVENT_POSTCREATE)	| \
+	(1 << DM_EVENT_REMOVE)		| \
+	(1 << DM_EVENT_POSTREMOVE)	| \
+	(1 << DM_EVENT_RENAME)		| \
+	(1 << DM_EVENT_POSTRENAME)	| \
+	(1 << DM_EVENT_LINK)		| \
+	(1 << DM_EVENT_POSTLINK)	| \
+	(1 << DM_EVENT_SYMLINK)		| \
+	(1 << DM_EVENT_POSTSYMLINK)	| \
+	(1 << DM_EVENT_READ)		| \
+	(1 << DM_EVENT_WRITE)		| \
+	(1 << DM_EVENT_TRUNCATE)	| \
+	(1 << DM_EVENT_ATTRIBUTE)	| \
+	(1 << DM_EVENT_DESTROY)		)
+
+
+extern int
+xfs_dm_mount(
+	vfs_t		*vfsp,
+	char		*dir_name,
+	char		*fsname);
+
+extern int
+xfs_dm_get_fsys_vector(
+	bhv_desc_t	*bdp,
+	dm_fcntl_vector_t *vecrq);
+
+extern int
+xfs_dm_send_data_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp,
+	xfs_off_t	offset,
+	size_t		length,
+	int		flags,
+	vrwlock_t	*locktype);
+
+extern int
+xfs_dm_send_create_event(
+	bhv_desc_t	*dir_bdp,
+	char		*name,
+	mode_t		new_mode,
+	int		*good_event_sent);
+
+extern int
+xfs_dm_send_mmap_event(
+	struct vm_area_struct *vma,
+	unsigned int	wantflag);
+
+#else /* CONFIG_XFS_DMAPI */
+
+/*
+ *	Flags needed to build with dmapi disabled.
+ */
+
+typedef enum {
+	DM_EVENT_INVALID	= -1,
+	DM_EVENT_CANCEL		= 0,		/* not supported */
+	DM_EVENT_MOUNT		= 1,
+	DM_EVENT_PREUNMOUNT	= 2,
+	DM_EVENT_UNMOUNT	= 3,
+	DM_EVENT_DEBUT		= 4,		/* not supported */
+	DM_EVENT_CREATE		= 5,
+	DM_EVENT_CLOSE		= 6,		/* not supported */
+	DM_EVENT_POSTCREATE	= 7,
+	DM_EVENT_REMOVE		= 8,
+	DM_EVENT_POSTREMOVE	= 9,
+	DM_EVENT_RENAME		= 10,
+	DM_EVENT_POSTRENAME	= 11,
+	DM_EVENT_LINK		= 12,
+	DM_EVENT_POSTLINK	= 13,
+	DM_EVENT_SYMLINK	= 14,
+	DM_EVENT_POSTSYMLINK	= 15,
+	DM_EVENT_READ		= 16,
+	DM_EVENT_WRITE		= 17,
+	DM_EVENT_TRUNCATE	= 18,
+	DM_EVENT_ATTRIBUTE	= 19,
+	DM_EVENT_DESTROY	= 20,
+	DM_EVENT_NOSPACE	= 21,
+	DM_EVENT_USER		= 22,
+	DM_EVENT_MAX		= 23
+} dm_eventtype_t;
+
+typedef enum {
+	DM_RIGHT_NULL,
+	DM_RIGHT_SHARED,
+	DM_RIGHT_EXCL
+} dm_right_t;
+
+/*
+ *	Defines for determining if an event message should be sent.
+ */
+#define DM_EVENT_ENABLED(vfsp, ip, event)	0
+#define DM_EVENT_ENABLED_IO(vfsp, io, event)	0
+
+/*
+ *	Stubbed out DMAPI delay macros.
+ */
+
+#define FILP_DELAY_FLAG(filp)			0
+#define AT_DELAY_FLAG(f)			0
+
+/*
+ *	Events supported by the XFS filesystem.
+ */
+
+#define DM_XFS_VALID_FS_EVENTS			0
+#define DM_XFS_VALID_FILE_EVENTS		0
+#define DM_XFS_VALID_DIRECTORY_EVENTS		0
+#define DM_XFS_SUPPORTED_EVENTS			0
+
+/*
+ *	Dummy definitions used for the flags field on dm_send_*_event().
+ */
+
+#define DM_FLAGS_NDELAY		0x001	/* return EAGAIN after dm_pending() */
+#define DM_FLAGS_UNWANTED	0x002	/* event not in fsys dm_eventset_t */
+
+/*
+ *	Stubs for XFS DMAPI utility routines.
+ */
+
+static __inline int
+xfs_dm_send_create_event(
+	bhv_desc_t	*dir_bdp,
+	char		*name,
+	mode_t		new_mode,
+	int		*good_event_sent)
+{
+	return 0;
+}
+
+static __inline int
+xfs_dm_send_data_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp,
+	xfs_off_t	offset,
+	size_t		length,
+	int		flags,
+	vrwlock_t	*locktype)
+{
+	return nopkg();
+}
+
+static __inline int
+xfs_dm_send_mmap_event(
+	struct vm_area_struct *vma,
+	unsigned int	wantflag)
+{
+	return 0;
+}
+
+/*
+ *	Stubs for routines needed for the X/Open version of DMAPI.
+ */
+
+static __inline int
+dm_send_destroy_event(
+	bhv_desc_t	*bdp,
+	dm_right_t	vp_right)
+{
+	return nopkg();
+}
+
+static __inline int
+dm_send_namesp_event(
+	dm_eventtype_t	event,
+	bhv_desc_t	*bdp1,
+	dm_right_t	vp1_right,
+	bhv_desc_t	*bdp2,
+	dm_right_t	vp2_right,
+	char		*name1,
+	char		*name2,
+	mode_t		mode,
+	int		retcode,
+	int		flags)
+{
+	return nopkg();
+}
+
+static __inline void
+dm_send_unmount_event(
+	vfs_t		*vfsp,
+	vnode_t		*vp,
+	dm_right_t	vfsp_right,
+	mode_t		mode,
+	int		retcode,
+	int		flags)
+{
+}
+
+#endif	/* CONFIG_XFS_DMAPI */
+#endif	/* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_dqblk.h b/fs/xfs/xfs_dqblk.h
new file mode 100644
index 000000000000..40f6a4cd4a7b
--- /dev/null
+++ b/fs/xfs/xfs_dqblk.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DQBLK_H__
+#define __XFS_DQBLK_H__
+
+/*
+ * The ondisk form of a dquot structure.
+ */
+#define XFS_DQUOT_MAGIC		0x4451		/* 'DQ' */
+#define XFS_DQUOT_VERSION	(u_int8_t)0x01	/* latest version number */
+
+/*
+ * This is the main portion of the on-disk representation of quota
+ * information for a user. This is the q_core of the xfs_dquot_t that
+ * is kept in kernel memory. We pad this with some more expansion room
+ * to construct the on disk structure.
+ */
+typedef struct	xfs_disk_dquot {
+/*16*/	u_int16_t	d_magic;	/* dquot magic = XFS_DQUOT_MAGIC */
+/*8 */	u_int8_t	d_version;	/* dquot version */
+/*8 */	u_int8_t	d_flags;	/* XFS_DQ_USER/PROJ/GROUP */
+/*32*/	xfs_dqid_t	d_id;		/* user,project,group id */
+/*64*/	xfs_qcnt_t	d_blk_hardlimit;/* absolute limit on disk blks */
+/*64*/	xfs_qcnt_t	d_blk_softlimit;/* preferred limit on disk blks */
+/*64*/	xfs_qcnt_t	d_ino_hardlimit;/* maximum # allocated inodes */
+/*64*/	xfs_qcnt_t	d_ino_softlimit;/* preferred inode limit */
+/*64*/	xfs_qcnt_t	d_bcount;	/* disk blocks owned by the user */
+/*64*/	xfs_qcnt_t	d_icount;	/* inodes owned by the user */
+/*32*/	__int32_t	d_itimer;	/* zero if within inode limits if not,
+					   this is when we refuse service */
+/*32*/	__int32_t	d_btimer;	/* similar to above; for disk blocks */
+/*16*/	xfs_qwarncnt_t	d_iwarns;	/* warnings issued wrt num inodes */
+/*16*/	xfs_qwarncnt_t	d_bwarns;	/* warnings issued wrt disk blocks */
+/*32*/	__int32_t	d_pad0;		/* 64 bit align */
+/*64*/	xfs_qcnt_t	d_rtb_hardlimit;/* absolute limit on realtime blks */
+/*64*/	xfs_qcnt_t	d_rtb_softlimit;/* preferred limit on RT disk blks */
+/*64*/	xfs_qcnt_t	d_rtbcount;	/* realtime blocks owned */
+/*32*/	__int32_t	d_rtbtimer;	/* similar to above; for RT disk blocks */
+/*16*/	xfs_qwarncnt_t	d_rtbwarns;	/* warnings issued wrt RT disk blocks */
+/*16*/	__uint16_t	d_pad;
+} xfs_disk_dquot_t;
+
+/*
+ * This is what goes on disk. This is separated from the xfs_disk_dquot because
+ * carrying the unnecessary padding would be a waste of memory.
+ */
+typedef struct xfs_dqblk {
+	xfs_disk_dquot_t  dd_diskdq;	/* portion that lives incore as well */
+	char		  dd_fill[32];	/* filling for posterity */
+} xfs_dqblk_t;
+
+/*
+ * flags for q_flags field in the dquot.
+ */
+#define XFS_DQ_USER		0x0001		/* a user quota */
+/* #define XFS_DQ_PROJ		0x0002		-- project quota (IRIX) */
+#define XFS_DQ_GROUP		0x0004		/* a group quota */
+#define XFS_DQ_FLOCKED		0x0008		/* flush lock taken */
+#define XFS_DQ_DIRTY		0x0010		/* dquot is dirty */
+#define XFS_DQ_WANT		0x0020		/* for lookup/reclaim race */
+#define XFS_DQ_INACTIVE		0x0040		/* dq off mplist & hashlist */
+#define XFS_DQ_MARKER		0x0080		/* sentinel */
+
+/*
+ * In the worst case, when both user and group quotas are on,
+ * we can have a max of three dquots changing in a single transaction.
+ */
+#define XFS_DQUOT_LOGRES(mp)	(sizeof(xfs_disk_dquot_t) * 3)
+
+#endif	/* __XFS_DQBLK_H__ */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
new file mode 100644
index 000000000000..f67497d78935
--- /dev/null
+++ b/fs/xfs/xfs_dquot.c
@@ -0,0 +1,1660 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+
+/*
+   LOCK ORDER
+
+   inode lock		    (ilock)
+   dquot hash-chain lock    (hashlock)
+   xqm dquot freelist lock  (freelistlock
+   mount's dquot list lock  (mplistlock)
+   user dquot lock - lock ordering among dquots is based on the uid or gid
+   group dquot lock - similar to udquots. Between the two dquots, the udquot
+		      has to be locked first.
+   pin lock - the dquot lock must be held to take this lock.
+   flush lock - ditto.
+*/
+
+STATIC void		xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *);
+
+#ifdef DEBUG
+dev_t xfs_dqerror_dev = 0;
+int xfs_do_dqerror = 0;
+int xfs_dqreq_num = 0;
+int xfs_dqerror_mod = 33;
+#endif
+
+/*
+ * Allocate and initialize a dquot. We don't always allocate fresh memory;
+ * we try to reclaim a free dquot if the number of incore dquots are above
+ * a threshold.
+ * The only field inside the core that gets initialized at this point
+ * is the d_id field. The idea is to fill in the entire q_core
+ * when we read in the on disk dquot.
+ */
+xfs_dquot_t *
+xfs_qm_dqinit(
+	xfs_mount_t  *mp,
+	xfs_dqid_t   id,
+	uint	     type)
+{
+	xfs_dquot_t	*dqp;
+	boolean_t	brandnewdquot;
+
+	brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
+	dqp->dq_flags = type;
+	INT_SET(dqp->q_core.d_id, ARCH_CONVERT, id);
+	dqp->q_mount = mp;
+
+	/*
+	 * No need to re-initialize these if this is a reclaimed dquot.
+	 */
+	if (brandnewdquot) {
+		dqp->dq_flnext = dqp->dq_flprev = dqp;
+		mutex_init(&dqp->q_qlock,  MUTEX_DEFAULT, "xdq");
+		initnsema(&dqp->q_flock, 1, "fdq");
+		sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
+
+#ifdef DQUOT_TRACING
+		dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP);
+		xfs_dqtrace_entry(dqp, "DQINIT");
+#endif
+	} else {
+		/*
+		 * Only the q_core portion was bzeroed in dqreclaim_one().
+		 * So, we need to reset others.
+		 */
+		 dqp->q_nrefs = 0;
+		 dqp->q_blkno = 0;
+		 dqp->MPL_NEXT = dqp->HL_NEXT = NULL;
+		 dqp->HL_PREVP = dqp->MPL_PREVP = NULL;
+		 dqp->q_bufoffset = 0;
+		 dqp->q_fileoffset = 0;
+		 dqp->q_transp = NULL;
+		 dqp->q_gdquot = NULL;
+		 dqp->q_res_bcount = 0;
+		 dqp->q_res_icount = 0;
+		 dqp->q_res_rtbcount = 0;
+		 dqp->q_pincount = 0;
+		 dqp->q_hash = 0;
+		 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
+
+#ifdef DQUOT_TRACING
+		 ASSERT(dqp->q_trace);
+		 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
+#endif
+	 }
+
+	/*
+	 * log item gets initialized later
+	 */
+	return (dqp);
+}
+
+/*
+ * This is called to free all the memory associated with a dquot
+ */
+void
+xfs_qm_dqdestroy(
+	xfs_dquot_t	*dqp)
+{
+	ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
+
+	mutex_destroy(&dqp->q_qlock);
+	freesema(&dqp->q_flock);
+	sv_destroy(&dqp->q_pinwait);
+
+#ifdef DQUOT_TRACING
+	if (dqp->q_trace)
+	     ktrace_free(dqp->q_trace);
+	dqp->q_trace = NULL;
+#endif
+	kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+	atomic_dec(&xfs_Gqm->qm_totaldquots);
+}
+
+/*
+ * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
+ */
+STATIC void
+xfs_qm_dqinit_core(
+	xfs_dqid_t	 id,
+	uint		 type,
+	xfs_dqblk_t	 *d)
+{
+	/*
+	 * Caller has bzero'd the entire dquot 'chunk' already.
+	 */
+	INT_SET(d->dd_diskdq.d_magic, ARCH_CONVERT, XFS_DQUOT_MAGIC);
+	INT_SET(d->dd_diskdq.d_version, ARCH_CONVERT, XFS_DQUOT_VERSION);
+	INT_SET(d->dd_diskdq.d_id, ARCH_CONVERT, id);
+	INT_SET(d->dd_diskdq.d_flags, ARCH_CONVERT, type);
+}
+
+
+#ifdef DQUOT_TRACING
+/*
+ * Dquot tracing for debugging.
+ */
+/* ARGSUSED */
+void
+xfs_dqtrace_entry__(
+	xfs_dquot_t *dqp,
+	char *func,
+	void *retaddr,
+	xfs_inode_t *ip)
+{
+	xfs_dquot_t *udqp = NULL;
+	int ino;
+
+	ASSERT(dqp->q_trace);
+	if (ip) {
+		ino = ip->i_ino;
+		udqp = ip->i_udquot;
+	}
+	ktrace_enter(dqp->q_trace,
+		     (void *)(__psint_t)DQUOT_KTRACE_ENTRY,
+		     (void *)func,
+		     (void *)(__psint_t)dqp->q_nrefs,
+		     (void *)(__psint_t)dqp->dq_flags,
+		     (void *)(__psint_t)dqp->q_res_bcount,
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_icount, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT),
+		     (void *)(__psint_t)INT_GET(dqp->q_core.d_id, ARCH_CONVERT), /* 11 */
+		     (void *)(__psint_t)current_pid(),
+		     (void *)(__psint_t)ino,
+		     (void *)(__psint_t)retaddr,
+		     (void *)(__psint_t)udqp);
+	return;
+}
+#endif
+
+
+/*
+ * Check the limits and timers of a dquot and start or reset timers
+ * if necessary.
+ * This gets called even when quota enforcement is OFF, which makes our
+ * life a little less complicated. (We just don't reject any quota
+ * reservations in that case, when enforcement is off).
+ * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
+ * enforcement's off.
+ * In contrast, warnings are a little different in that they don't
+ * 'automatically' get started when limits get exceeded.
+ */
+void
+xfs_qm_adjust_dqtimers(
+	xfs_mount_t		*mp,
+	xfs_disk_dquot_t	*d)
+{
+	/*
+	 * The dquot had better be locked. We are modifying it here.
+	 */
+
+	/*
+	 * root's limits are not real limits.
+	 */
+	if (INT_ISZERO(d->d_id, ARCH_CONVERT))
+		return;
+
+#ifdef QUOTADEBUG
+	if (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT))
+		ASSERT(INT_GET(d->d_blk_softlimit, ARCH_CONVERT) <= INT_GET(d->d_blk_hardlimit, ARCH_CONVERT));
+	if (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT))
+		ASSERT(INT_GET(d->d_ino_softlimit, ARCH_CONVERT) <= INT_GET(d->d_ino_hardlimit, ARCH_CONVERT));
+#endif
+	if (INT_ISZERO(d->d_btimer, ARCH_CONVERT)) {
+		if ((INT_GET(d->d_blk_softlimit, ARCH_CONVERT) &&
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) >= INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) ||
+		    (INT_GET(d->d_blk_hardlimit, ARCH_CONVERT) &&
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) >= INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) {
+			INT_SET(d->d_btimer, ARCH_CONVERT, CURRENT_TIME + XFS_QI_BTIMELIMIT(mp));
+		}
+	} else {
+		if ((INT_ISZERO(d->d_blk_softlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) < INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) &&
+		    (INT_ISZERO(d->d_blk_hardlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) < INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) {
+			INT_ZERO(d->d_btimer, ARCH_CONVERT);
+		}
+	}
+
+	if (INT_ISZERO(d->d_itimer, ARCH_CONVERT)) {
+		if ((INT_GET(d->d_ino_softlimit, ARCH_CONVERT) &&
+		    (INT_GET(d->d_icount, ARCH_CONVERT) >= INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) ||
+		    (INT_GET(d->d_ino_hardlimit, ARCH_CONVERT) &&
+		    (INT_GET(d->d_icount, ARCH_CONVERT) >= INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) {
+			INT_SET(d->d_itimer, ARCH_CONVERT, CURRENT_TIME + XFS_QI_ITIMELIMIT(mp));
+		}
+	} else {
+		if ((INT_ISZERO(d->d_ino_softlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_icount, ARCH_CONVERT) < INT_GET(d->d_ino_softlimit, ARCH_CONVERT)))  &&
+		    (INT_ISZERO(d->d_ino_hardlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_icount, ARCH_CONVERT) < INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) {
+			INT_ZERO(d->d_itimer, ARCH_CONVERT);
+		}
+	}
+}
+
+/*
+ * Increment or reset warnings of a given dquot.
+ */
+int
+xfs_qm_dqwarn(
+	xfs_disk_dquot_t	*d,
+	uint			flags)
+{
+	int	warned;
+
+	/*
+	 * root's limits are not real limits.
+	 */
+	if (INT_ISZERO(d->d_id, ARCH_CONVERT))
+		return (0);
+
+	warned = 0;
+	if (INT_GET(d->d_blk_softlimit, ARCH_CONVERT) &&
+	    (INT_GET(d->d_bcount, ARCH_CONVERT) >= INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) {
+		if (flags & XFS_QMOPT_DOWARN) {
+			INT_MOD(d->d_bwarns, ARCH_CONVERT, +1);
+			warned++;
+		}
+	} else {
+		if (INT_ISZERO(d->d_blk_softlimit, ARCH_CONVERT) ||
+		    (INT_GET(d->d_bcount, ARCH_CONVERT) < INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) {
+			INT_ZERO(d->d_bwarns, ARCH_CONVERT);
+		}
+	}
+
+	if (INT_GET(d->d_ino_softlimit, ARCH_CONVERT) > 0 &&
+	    (INT_GET(d->d_icount, ARCH_CONVERT) >= INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) {
+		if (flags & XFS_QMOPT_DOWARN) {
+			INT_MOD(d->d_iwarns, ARCH_CONVERT, +1);
+			warned++;
+		}
+	} else {
+		if ((INT_ISZERO(d->d_ino_softlimit, ARCH_CONVERT)) ||
+		    (INT_GET(d->d_icount, ARCH_CONVERT) < INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) {
+			INT_ZERO(d->d_iwarns, ARCH_CONVERT);
+		}
+	}
+#ifdef QUOTADEBUG
+	if (INT_GET(d->d_iwarns, ARCH_CONVERT))
+		printk("--------@@Inode warnings running : %Lu >= %Lu\n",
+		       INT_GET(d->d_icount, ARCH_CONVERT), INT_GET(d->d_ino_softlimit, ARCH_CONVERT));
+	if (INT_GET(d->d_bwarns, ARCH_CONVERT))
+		printk("--------@@Blks warnings running : %Lu >= %Lu\n",
+		       INT_GET(d->d_bcount, ARCH_CONVERT), INT_GET(d->d_blk_softlimit, ARCH_CONVERT));
+#endif
+	return (warned);
+}
+
+
+/*
+ * initialize a buffer full of dquots and log the whole thing
+ */
+STATIC void
+xfs_qm_init_dquot_blk(
+	xfs_trans_t	*tp,
+	xfs_mount_t	*mp,
+	xfs_dqid_t	id,
+	uint		type,
+	xfs_buf_t	*bp)
+{
+	xfs_dqblk_t	*d;
+	int		curid, i;
+
+	ASSERT(tp);
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+	d = (xfs_dqblk_t *)XFS_BUF_PTR(bp);
+
+	/*
+	 * ID of the first dquot in the block - id's are zero based.
+	 */
+	curid = id - (id % XFS_QM_DQPERBLK(mp));
+	ASSERT(curid >= 0);
+	bzero(d, BBTOB(XFS_QI_DQCHUNKLEN(mp)));
+	for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++)
+		xfs_qm_dqinit_core(curid, type, d);
+	xfs_trans_dquot_buf(tp, bp,
+			    type & XFS_DQ_USER ?
+			    XFS_BLI_UDQUOT_BUF :
+			    XFS_BLI_GDQUOT_BUF);
+	xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1);
+}
+
+
+
+/*
+ * Allocate a block and fill it with dquots.
+ * This is called when the bmapi finds a hole.
+ */
+STATIC int
+xfs_qm_dqalloc(
+	xfs_trans_t	*tp,
+	xfs_mount_t	*mp,
+	xfs_dquot_t	*dqp,
+	xfs_inode_t	*quotip,
+	xfs_fileoff_t	offset_fsb,
+	xfs_buf_t	**O_bpp)
+{
+	xfs_fsblock_t	firstblock;
+	xfs_bmap_free_t flist;
+	xfs_bmbt_irec_t map;
+	int		nmaps, error, committed;
+	xfs_buf_t	*bp;
+
+	ASSERT(tp != NULL);
+	xfs_dqtrace_entry(dqp, "DQALLOC");
+
+	/*
+	 * Initialize the bmap freelist prior to calling bmapi code.
+	 */
+	XFS_BMAP_INIT(&flist, &firstblock);
+	xfs_ilock(quotip, XFS_ILOCK_EXCL);
+	/*
+	 * Return if this type of quotas is turned off while we didn't
+	 * have an inode lock
+	 */
+	if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+		return (ESRCH);
+	}
+
+	/*
+	 * xfs_trans_commit normally decrements the vnode ref count
+	 * when it unlocks the inode. Since we want to keep the quota
+	 * inode around, we bump the vnode ref count now.
+	 */
+	VN_HOLD(XFS_ITOV(quotip));
+
+	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
+	nmaps = 1;
+	if ((error = xfs_bmapi(tp, quotip,
+			      offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
+			      XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
+			      &firstblock,
+			      XFS_QM_DQALLOC_SPACE_RES(mp),
+			      &map, &nmaps, &flist))) {
+		goto error0;
+	}
+	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
+	ASSERT(nmaps == 1);
+	ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+	       (map.br_startblock != HOLESTARTBLOCK));
+
+	/*
+	 * Keep track of the blkno to save a lookup later
+	 */
+	dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+	/* now we can just get the buffer (there's nothing to read yet) */
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+			       dqp->q_blkno,
+			       XFS_QI_DQCHUNKLEN(mp),
+			       0);
+	if (!bp || (error = XFS_BUF_GETERROR(bp)))
+		goto error1;
+	/*
+	 * Make a chunk of dquots out of this buffer and log
+	 * the entire thing.
+	 */
+	xfs_qm_init_dquot_blk(tp, mp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT),
+			      dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP),
+			      bp);
+
+	if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) {
+		goto error1;
+	}
+
+	*O_bpp = bp;
+	return 0;
+
+      error1:
+	xfs_bmap_cancel(&flist);
+      error0:
+	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+
+	return (error);
+}
+
+/*
+ * Maps a dquot to the buffer containing its on-disk version.
+ * This returns a ptr to the buffer containing the on-disk dquot
+ * in the bpp param, and a ptr to the on-disk dquot within that buffer
+ */
+STATIC int
+xfs_qm_dqtobp(
+	xfs_trans_t		*tp,
+	xfs_dquot_t		*dqp,
+	xfs_disk_dquot_t	**O_ddpp,
+	xfs_buf_t		**O_bpp,
+	uint			flags)
+{
+	xfs_bmbt_irec_t map;
+	int		nmaps, error;
+	xfs_buf_t	*bp;
+	xfs_inode_t	*quotip;
+	xfs_mount_t	*mp;
+	xfs_disk_dquot_t *ddq;
+	xfs_dqid_t	id;
+	boolean_t	newdquot;
+
+	mp = dqp->q_mount;
+	id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
+	nmaps = 1;
+	newdquot = B_FALSE;
+
+	/*
+	 * If we don't know where the dquot lives, find out.
+	 */
+	if (dqp->q_blkno == (xfs_daddr_t) 0) {
+		/* We use the id as an index */
+		dqp->q_fileoffset = (xfs_fileoff_t) ((uint)id /
+						     XFS_QM_DQPERBLK(mp));
+		nmaps = 1;
+		quotip = XFS_DQ_TO_QIP(dqp);
+		xfs_ilock(quotip, XFS_ILOCK_SHARED);
+		/*
+		 * Return if this type of quotas is turned off while we didn't
+		 * have an inode lock
+		 */
+		if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+			xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+			return (ESRCH);
+		}
+		/*
+		 * Find the block map; no allocations yet
+		 */
+		error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
+				  XFS_DQUOT_CLUSTER_SIZE_FSB,
+				  XFS_BMAPI_METADATA,
+				  NULL, 0, &map, &nmaps, NULL);
+
+		xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+		if (error)
+			return (error);
+		ASSERT(nmaps == 1);
+		ASSERT(map.br_blockcount == 1);
+
+		/*
+		 * offset of dquot in the (fixed sized) dquot chunk.
+		 */
+		dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) *
+			sizeof(xfs_dqblk_t);
+		if (map.br_startblock == HOLESTARTBLOCK) {
+			/*
+			 * We don't allocate unless we're asked to
+			 */
+			if (!(flags & XFS_QMOPT_DQALLOC))
+				return (ENOENT);
+
+			ASSERT(tp);
+			if ((error = xfs_qm_dqalloc(tp, mp, dqp, quotip,
+						dqp->q_fileoffset, &bp)))
+				return (error);
+			newdquot = B_TRUE;
+		} else {
+			/*
+			 * store the blkno etc so that we don't have to do the
+			 * mapping all the time
+			 */
+			dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+		}
+	}
+	ASSERT(dqp->q_blkno != DELAYSTARTBLOCK);
+	ASSERT(dqp->q_blkno != HOLESTARTBLOCK);
+
+	/*
+	 * Read in the buffer, unless we've just done the allocation
+	 * (in which case we already have the buf).
+	 */
+	if (! newdquot) {
+		xfs_dqtrace_entry(dqp, "DQTOBP READBUF");
+		if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+					       dqp->q_blkno,
+					       XFS_QI_DQCHUNKLEN(mp),
+					       0, &bp))) {
+			return (error);
+		}
+		if (error || !bp)
+			return XFS_ERROR(error);
+	}
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+
+	/*
+	 * calculate the location of the dquot inside the buffer.
+	 */
+	ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset);
+
+	/*
+	 * A simple sanity check in case we got a corrupted dquot...
+	 */
+	if (xfs_qm_dqcheck(ddq, id,
+			   dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP),
+			   flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
+			   "dqtobp")) {
+		if (!(flags & XFS_QMOPT_DQREPAIR)) {
+			xfs_trans_brelse(tp, bp);
+			return XFS_ERROR(EIO);
+		}
+		XFS_BUF_BUSY(bp); /* We dirtied this */
+	}
+
+	*O_bpp = bp;
+	*O_ddpp = ddq;
+
+	return (0);
+}
+
+
+/*
+ * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
+ * and release the buffer immediately.
+ *
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqread(
+	xfs_trans_t	*tp,
+	xfs_dqid_t	id,
+	xfs_dquot_t	*dqp,	/* dquot to get filled in */
+	uint		flags)
+{
+	xfs_disk_dquot_t *ddqp;
+	xfs_buf_t	 *bp;
+	int		 error;
+
+	/*
+	 * get a pointer to the on-disk dquot and the buffer containing it
+	 * dqp already knows its own type (GROUP/USER).
+	 */
+	xfs_dqtrace_entry(dqp, "DQREAD");
+	if ((error = xfs_qm_dqtobp(tp, dqp, &ddqp, &bp, flags))) {
+		return (error);
+	}
+
+	/* copy everything from disk dquot to the incore dquot */
+	bcopy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
+	ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id);
+	xfs_qm_dquot_logitem_init(dqp);
+
+	/*
+	 * Reservation counters are defined as reservation plus current usage
+	 * to avoid having to add everytime.
+	 */
+	dqp->q_res_bcount = INT_GET(ddqp->d_bcount, ARCH_CONVERT);
+	dqp->q_res_icount = INT_GET(ddqp->d_icount, ARCH_CONVERT);
+	dqp->q_res_rtbcount = INT_GET(ddqp->d_rtbcount, ARCH_CONVERT);
+
+	/* Mark the buf so that this will stay incore a little longer */
+	XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
+
+	/*
+	 * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
+	 * So we need to release with xfs_trans_brelse().
+	 * The strategy here is identical to that of inodes; we lock
+	 * the dquot in xfs_qm_dqget() before making it accessible to
+	 * others. This is because dquots, like inodes, need a good level of
+	 * concurrency, and we don't want to take locks on the entire buffers
+	 * for dquot accesses.
+	 * Note also that the dquot buffer may even be dirty at this point, if
+	 * this particular dquot was repaired. We still aren't afraid to
+	 * brelse it because we have the changes incore.
+	 */
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+	xfs_trans_brelse(tp, bp);
+
+	return (error);
+}
+
+
+/*
+ * allocate an incore dquot from the kernel heap,
+ * and fill its core with quota information kept on disk.
+ * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
+ * if it wasn't already allocated.
+ */
+STATIC int
+xfs_qm_idtodq(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	id,	 /* gid or uid, depending on type */
+	uint		type,	 /* UDQUOT or GDQUOT */
+	uint		flags,	 /* DQALLOC, DQREPAIR */
+	xfs_dquot_t	**O_dqpp)/* OUT : incore dquot, not locked */
+{
+	xfs_dquot_t	*dqp;
+	int		error;
+	xfs_trans_t	*tp;
+	int		cancelflags=0;
+
+	dqp = xfs_qm_dqinit(mp, id, type);
+	tp = NULL;
+	if (flags & XFS_QMOPT_DQALLOC) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
+		if ((error = xfs_trans_reserve(tp,
+				       XFS_QM_DQALLOC_SPACE_RES(mp),
+				       XFS_WRITE_LOG_RES(mp) +
+					      BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 +
+					      128,
+				       0,
+				       XFS_TRANS_PERM_LOG_RES,
+				       XFS_WRITE_LOG_COUNT))) {
+			cancelflags = 0;
+			goto error0;
+		}
+		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+	}
+
+	/*
+	 * Read it from disk; xfs_dqread() takes care of
+	 * all the necessary initialization of dquot's fields (locks, etc)
+	 */
+	if ((error = xfs_qm_dqread(tp, id, dqp, flags))) {
+		/*
+		 * This can happen if quotas got turned off (ESRCH),
+		 * or if the dquot didn't exist on disk and we ask to
+		 * allocate (ENOENT).
+		 */
+		xfs_dqtrace_entry(dqp, "DQREAD FAIL");
+		cancelflags |= XFS_TRANS_ABORT;
+		goto error0;
+	}
+	if (tp) {
+		if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
+					     NULL)))
+			goto error1;
+	}
+
+	*O_dqpp = dqp;
+	ASSERT(! XFS_DQ_IS_LOCKED(dqp));
+	return (0);
+
+ error0:
+	ASSERT(error);
+	if (tp)
+		xfs_trans_cancel(tp, cancelflags);
+ error1:
+	xfs_qm_dqdestroy(dqp);
+	*O_dqpp = NULL;
+	return (error);
+}
+
+/*
+ * Lookup a dquot in the incore dquot hashtable. We keep two separate
+ * hashtables for user and group dquots; and, these are global tables
+ * inside the XQM, not per-filesystem tables.
+ * The hash chain must be locked by caller, and it is left locked
+ * on return. Returning dquot is locked.
+ */
+STATIC int
+xfs_qm_dqlookup(
+	xfs_mount_t		*mp,
+	xfs_dqid_t		id,
+	xfs_dqhash_t		*qh,
+	xfs_dquot_t		**O_dqpp)
+{
+	xfs_dquot_t		*dqp;
+	uint			flist_locked;
+	xfs_dquot_t		*d;
+
+	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+
+	flist_locked = B_FALSE;
+
+	/*
+	 * Traverse the hashchain looking for a match
+	 */
+	for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) {
+		/*
+		 * We already have the hashlock. We don't need the
+		 * dqlock to look at the id field of the dquot, since the
+		 * id can't be modified without the hashlock anyway.
+		 */
+		if (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id && dqp->q_mount == mp) {
+			xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP");
+			/*
+			 * All in core dquots must be on the dqlist of mp
+			 */
+			ASSERT(dqp->MPL_PREVP != NULL);
+
+			xfs_dqlock(dqp);
+			if (dqp->q_nrefs == 0) {
+				ASSERT (XFS_DQ_IS_ON_FREELIST(dqp));
+				if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+					xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT");
+
+					/*
+					 * We may have raced with dqreclaim_one()
+					 * (and lost). So, flag that we don't
+					 * want the dquot to be reclaimed.
+					 */
+					dqp->dq_flags |= XFS_DQ_WANT;
+					xfs_dqunlock(dqp);
+					xfs_qm_freelist_lock(xfs_Gqm);
+					xfs_dqlock(dqp);
+					dqp->dq_flags &= ~(XFS_DQ_WANT);
+				}
+				flist_locked = B_TRUE;
+			}
+
+			/*
+			 * id couldn't have changed; we had the hashlock all
+			 * along
+			 */
+			ASSERT(INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id);
+
+			if (flist_locked) {
+				if (dqp->q_nrefs != 0) {
+					xfs_qm_freelist_unlock(xfs_Gqm);
+					flist_locked = B_FALSE;
+				} else {
+					/*
+					 * take it off the freelist
+					 */
+					xfs_dqtrace_entry(dqp,
+							"DQLOOKUP: TAKEOFF FL");
+					XQM_FREELIST_REMOVE(dqp);
+					/* xfs_qm_freelist_print(&(xfs_Gqm->
+							qm_dqfreelist),
+							"after removal"); */
+				}
+			}
+
+			/*
+			 * grab a reference
+			 */
+			XFS_DQHOLD(dqp);
+
+			if (flist_locked)
+				xfs_qm_freelist_unlock(xfs_Gqm);
+			/*
+			 * move the dquot to the front of the hashchain
+			 */
+			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			if (dqp->HL_PREVP != &qh->qh_next) {
+				xfs_dqtrace_entry(dqp,
+						  "DQLOOKUP: HASH MOVETOFRONT");
+				if ((d = dqp->HL_NEXT))
+					d->HL_PREVP = dqp->HL_PREVP;
+				*(dqp->HL_PREVP) = d;
+				d = qh->qh_next;
+				d->HL_PREVP = &dqp->HL_NEXT;
+				dqp->HL_NEXT = d;
+				dqp->HL_PREVP = &qh->qh_next;
+				qh->qh_next = dqp;
+			}
+			xfs_dqtrace_entry(dqp, "LOOKUP END");
+			*O_dqpp = dqp;
+			ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+			return (0);
+		}
+	}
+
+	*O_dqpp = NULL;
+	ASSERT(XFS_DQ_IS_HASH_LOCKED(qh));
+	return (1);
+}
+
+/*
+ * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
+ * a locked dquot, doing an allocation (if requested) as needed.
+ * When both an inode and an id are given, the inode's id takes precedence.
+ * That is, if the id changes while we dont hold the ilock inside this
+ * function, the new dquot is returned, not necessarily the one requested
+ * in the id argument.
+ */
+int
+xfs_qm_dqget(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,	  /* locked inode (optional) */
+	xfs_dqid_t	id,	  /* gid or uid, depending on type */
+	uint		type,	  /* UDQUOT or GDQUOT */
+	uint		flags,	  /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
+	xfs_dquot_t	**O_dqpp) /* OUT : locked incore dquot */
+{
+	xfs_dquot_t	*dqp;
+	xfs_dqhash_t	*h;
+	uint		version;
+	int		error;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
+	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
+		return (ESRCH);
+	}
+	h = XFS_DQ_HASH(mp, id, type);
+
+#ifdef DEBUG
+	if (xfs_do_dqerror) {
+		if ((xfs_dqerror_dev == mp->m_dev) &&
+		    (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
+			printk("Returning error in dqget\n");
+			return (EIO);
+		}
+	}
+#endif
+
+ again:
+
+#ifdef DEBUG
+	ASSERT(type == XFS_DQ_USER || type == XFS_DQ_GROUP);
+	if (ip) {
+		ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+		if (type == XFS_DQ_USER)
+			ASSERT(ip->i_udquot == NULL);
+		else
+			ASSERT(ip->i_gdquot == NULL);
+	}
+#endif
+	XFS_DQ_HASH_LOCK(h);
+
+	/*
+	 * Look in the cache (hashtable).
+	 * The chain is kept locked during lookup.
+	 */
+	if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
+		XFS_STATS_INC(xfsstats.xs_qm_dqcachehits);
+		/*
+		 * The dquot was found, moved to the front of the chain,
+		 * taken off the freelist if it was on it, and locked
+		 * at this point. Just unlock the hashchain and return.
+		 */
+		ASSERT(*O_dqpp);
+		ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
+		XFS_DQ_HASH_UNLOCK(h);
+		xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)");
+		return (0);	/* success */
+	}
+	XFS_STATS_INC(xfsstats.xs_qm_dqcachemisses);
+
+	/*
+	 * Dquot cache miss. We don't want to keep the inode lock across
+	 * a (potential) disk read. Also we don't want to deal with the lock
+	 * ordering between quotainode and this inode. OTOH, dropping the inode
+	 * lock here means dealing with a chown that can happen before
+	 * we re-acquire the lock.
+	 */
+	if (ip)
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * Save the hashchain version stamp, and unlock the chain, so that
+	 * we don't keep the lock across a disk read
+	 */
+	version = h->qh_version;
+	XFS_DQ_HASH_UNLOCK(h);
+
+	/*
+	 * Allocate the dquot on the kernel heap, and read the ondisk
+	 * portion off the disk. Also, do all the necessary initialization
+	 * This can return ENOENT if dquot didn't exist on disk and we didn't
+	 * ask it to allocate; ESRCH if quotas got turned off suddenly.
+	 */
+	if ((error = xfs_qm_idtodq(mp, id, type,
+				  flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
+					   XFS_QMOPT_DOWARN),
+				  &dqp))) {
+		if (ip)
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+		return (error);
+	}
+
+	/*
+	 * See if this is mount code calling to look at the overall quota limits
+	 * which are stored in the id == 0 user or group's dquot.
+	 * Since we may not have done a quotacheck by this point, just return
+	 * the dquot without attaching it to any hashtables, lists, etc, or even
+	 * taking a reference.
+	 * The caller must dqdestroy this once done.
+	 */
+	if (flags & XFS_QMOPT_DQSUSER) {
+		ASSERT(id == 0);
+		ASSERT(! ip);
+		goto dqret;
+	}
+
+	/*
+	 * Dquot lock comes after hashlock in the lock ordering
+	 */
+	ASSERT(! XFS_DQ_IS_LOCKED(dqp));
+	if (ip) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (! XFS_IS_DQTYPE_ON(mp, type)) {
+			/* inode stays locked on return */
+			xfs_qm_dqdestroy(dqp);
+			return XFS_ERROR(ESRCH);
+		}
+		/*
+		 * A dquot could be attached to this inode by now, since
+		 * we had dropped the ilock.
+		 */
+		if (type == XFS_DQ_USER) {
+			if (ip->i_udquot) {
+				xfs_qm_dqdestroy(dqp);
+				dqp = ip->i_udquot;
+				xfs_dqlock(dqp);
+				goto dqret;
+			}
+		} else {
+			if (ip->i_gdquot) {
+				xfs_qm_dqdestroy(dqp);
+				dqp = ip->i_gdquot;
+				xfs_dqlock(dqp);
+				goto dqret;
+			}
+		}
+	}
+
+	/*
+	 * Hashlock comes after ilock in lock order
+	 */
+	XFS_DQ_HASH_LOCK(h);
+	if (version != h->qh_version) {
+		xfs_dquot_t *tmpdqp;
+		/*
+		 * Now, see if somebody else put the dquot in the
+		 * hashtable before us. This can happen because we didn't
+		 * keep the hashchain lock. We don't have to worry about
+		 * lock order between the two dquots here since dqp isn't
+		 * on any findable lists yet.
+		 */
+		if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
+			/*
+			 * Duplicate found. Just throw away the new dquot
+			 * and start over.
+			 */
+			xfs_qm_dqput(tmpdqp);
+			XFS_DQ_HASH_UNLOCK(h);
+			xfs_qm_dqdestroy(dqp);
+			XFS_STATS_INC(xfsstats.xs_qm_dquot_dups);
+			goto again;
+		}
+	}
+
+	/*
+	 * Put the dquot at the beginning of the hash-chain and mp's list
+	 * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
+	 */
+	ASSERT(XFS_DQ_IS_HASH_LOCKED(h));
+	dqp->q_hash = h;
+	XQM_HASHLIST_INSERT(h, dqp);
+
+	/*
+	 * Attach this dquot to this filesystem's list of all dquots,
+	 * kept inside the mount structure in m_quotainfo field
+	 */
+	xfs_qm_mplist_lock(mp);
+
+	/*
+	 * We return a locked dquot to the caller, with a reference taken
+	 */
+	xfs_dqlock(dqp);
+	dqp->q_nrefs = 1;
+
+	XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp);
+
+	xfs_qm_mplist_unlock(mp);
+	XFS_DQ_HASH_UNLOCK(h);
+ dqret:
+	ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip));
+	xfs_dqtrace_entry(dqp, "DQGET DONE");
+	*O_dqpp = dqp;
+	return (0);
+}
+
+
+/*
+ * Release a reference to the dquot (decrement ref-count)
+ * and unlock it. If there is a group quota attached to this
+ * dquot, carefully release that too without tripping over
+ * deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+	xfs_dquot_t	*dqp)
+{
+	xfs_dquot_t	*gdqp;
+
+	ASSERT(dqp->q_nrefs > 0);
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	xfs_dqtrace_entry(dqp, "DQPUT");
+
+	if (dqp->q_nrefs != 1) {
+		dqp->q_nrefs--;
+		xfs_dqunlock(dqp);
+		return;
+	}
+
+	/*
+	 * drop the dqlock and acquire the freelist and dqlock
+	 * in the right order; but try to get it out-of-order first
+	 */
+	if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) {
+		xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT");
+		xfs_dqunlock(dqp);
+		xfs_qm_freelist_lock(xfs_Gqm);
+		xfs_dqlock(dqp);
+	}
+
+	while (1) {
+		gdqp = NULL;
+
+		/* We can't depend on nrefs being == 1 here */
+		if (--dqp->q_nrefs == 0) {
+			xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST");
+			/*
+			 * insert at end of the freelist.
+			 */
+			XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp);
+
+			/*
+			 * If we just added a udquot to the freelist, then
+			 * we want to release the gdquot reference that
+			 * it (probably) has. Otherwise it'll keep the
+			 * gdquot from getting reclaimed.
+			 */
+			if ((gdqp = dqp->q_gdquot)) {
+				/*
+				 * Avoid a recursive dqput call
+				 */
+				xfs_dqlock(gdqp);
+				dqp->q_gdquot = NULL;
+			}
+
+			/* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist),
+			   "@@@@@++ Free list (after append) @@@@@+");
+			   */
+		}
+		xfs_dqunlock(dqp);
+
+		/*
+		 * If we had a group quota inside the user quota as a hint,
+		 * release it now.
+		 */
+		if (! gdqp)
+			break;
+		dqp = gdqp;
+	}
+	xfs_qm_freelist_unlock(xfs_Gqm);
+}
+
+/*
+ * Release a dquot. Flush it if dirty, then dqput() it.
+ * dquot must not be locked.
+ */
+void
+xfs_qm_dqrele(
+	xfs_dquot_t	*dqp)
+{
+	ASSERT(dqp);
+	xfs_dqtrace_entry(dqp, "DQRELE");
+
+	xfs_dqlock(dqp);
+	/*
+	 * We don't care to flush it if the dquot is dirty here.
+	 * That will create stutters that we want to avoid.
+	 * Instead we do a delayed write when we try to reclaim
+	 * a dirty dquot. Also xfs_sync will take part of the burden...
+	 */
+	xfs_qm_dqput(dqp);
+}
+
+
+/*
+ * Write a modified dquot to disk.
+ * The dquot must be locked and the flush lock too taken by caller.
+ * The flush lock will not be unlocked until the dquot reaches the disk,
+ * but the dquot is free to be unlocked and modified by the caller
+ * in the interim. Dquot is still locked on return. This behavior is
+ * identical to that of inodes.
+ */
+int
+xfs_qm_dqflush(
+	xfs_dquot_t		*dqp,
+	uint			flags)
+{
+	xfs_mount_t		*mp;
+	xfs_buf_t		*bp;
+	xfs_disk_dquot_t	*ddqp;
+	int			error;
+	SPLDECL(s);
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+	xfs_dqtrace_entry(dqp, "DQFLUSH");
+
+	/*
+	 * If not dirty, nada.
+	 */
+	if (!XFS_DQ_IS_DIRTY(dqp)) {
+		xfs_dqfunlock(dqp);
+		return (0);
+	}
+
+	/*
+	 * Cant flush a pinned dquot. Wait for it.
+	 */
+	xfs_qm_dqunpin_wait(dqp);
+
+	/*
+	 * This may have been unpinned because the filesystem is shutting
+	 * down forcibly. If that's the case we must not write this dquot
+	 * to disk, because the log record didn't make it to disk!
+	 */
+	if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) {
+		dqp->dq_flags &= ~(XFS_DQ_DIRTY);
+		xfs_dqfunlock(dqp);
+		return XFS_ERROR(EIO);
+	}
+
+	/*
+	 * Get the buffer containing the on-disk dquot
+	 * We don't need a transaction envelope because we know that the
+	 * the ondisk-dquot has already been allocated for.
+	 */
+	if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) {
+		xfs_dqtrace_entry(dqp, "DQTOBP FAIL");
+		ASSERT(error != ENOENT);
+		/*
+		 * Quotas could have gotten turned off (ESRCH)
+		 */
+		xfs_dqfunlock(dqp);
+		return (error);
+	}
+
+	if (xfs_qm_dqcheck(&dqp->q_core, INT_GET(ddqp->d_id, ARCH_CONVERT), 0, XFS_QMOPT_DOWARN,
+			   "dqflush (incore copy)")) {
+		xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE);
+		return XFS_ERROR(EIO);
+	}
+
+	/* This is the only portion of data that needs to persist */
+	bcopy(&(dqp->q_core), ddqp, sizeof(xfs_disk_dquot_t));
+
+	/*
+	 * Clear the dirty field and remember the flush lsn for later use.
+	 */
+	dqp->dq_flags &= ~(XFS_DQ_DIRTY);
+	mp = dqp->q_mount;
+
+	/* lsn is 64 bits */
+	AIL_LOCK(mp, s);
+	dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
+	AIL_UNLOCK(mp, s);
+
+	/*
+	 * Attach an iodone routine so that we can remove this dquot from the
+	 * AIL and release the flush lock once the dquot is synced to disk.
+	 */
+	xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *))
+			      xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item));
+	/*
+	 * If the buffer is pinned then push on the log so we won't
+	 * get stuck waiting in the write for too long.
+	 */
+	if (XFS_BUF_ISPINNED(bp)) {
+		xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE");
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	}
+
+	if (flags & XFS_QMOPT_DELWRI) {
+		xfs_bdwrite(mp, bp);
+	} else if (flags & XFS_QMOPT_ASYNC) {
+		xfs_bawrite(mp, bp);
+	} else {
+		error = xfs_bwrite(mp, bp);
+	}
+	xfs_dqtrace_entry(dqp, "DQFLUSH END");
+	/*
+	 * dqp is still locked, but caller is free to unlock it now.
+	 */
+	return (error);
+
+}
+
+/*
+ * This is the dquot flushing I/O completion routine.  It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk.  It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes..
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_dqflush_done(
+	xfs_buf_t		*bp,
+	xfs_dq_logitem_t	*qip)
+{
+	xfs_dquot_t		*dqp;
+	SPLDECL(s);
+
+	dqp = qip->qli_dquot;
+
+	/*
+	 * We only want to pull the item from the AIL if its
+	 * location in the log has not changed since we started the flush.
+	 * Thus, we only bother if the dquot's lsn has
+	 * not changed. First we check the lsn outside the lock
+	 * since it's cheaper, and then we recheck while
+	 * holding the lock before removing the dquot from the AIL.
+	 */
+	if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
+	    qip->qli_item.li_lsn == qip->qli_flush_lsn) {
+
+		AIL_LOCK(dqp->q_mount, s);
+		/*
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
+			xfs_trans_delete_ail(dqp->q_mount,
+					     (xfs_log_item_t*)qip, s);
+		else
+			AIL_UNLOCK(dqp->q_mount, s);
+	}
+
+	/*
+	 * Release the dq's flush lock since we're done with it.
+	 */
+	xfs_dqfunlock(dqp);
+}
+
+
+int
+xfs_qm_dqflock_nowait(
+	xfs_dquot_t *dqp)
+{
+	int locked;
+
+	locked = cpsema(&((dqp)->q_flock));
+
+	/* XXX ifdef these out */
+	if (locked)
+		(dqp)->dq_flags |= XFS_DQ_FLOCKED;
+	return (locked);
+}
+
+
+int
+xfs_qm_dqlock_nowait(
+	xfs_dquot_t *dqp)
+{
+	return (mutex_trylock(&((dqp)->q_qlock)));
+}
+
+void
+xfs_dqlock(
+	xfs_dquot_t *dqp)
+{
+	mutex_lock(&(dqp->q_qlock), PINOD);
+}
+
+void
+xfs_dqunlock(
+	xfs_dquot_t *dqp)
+{
+	mutex_unlock(&(dqp->q_qlock));
+	if (dqp->q_logitem.qli_dquot == dqp) {
+		/* Once was dqp->q_mount, but might just have been cleared */
+		xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp,
+					(xfs_log_item_t*)&(dqp->q_logitem));
+	}
+}
+
+
+void
+xfs_dqunlock_nonotify(
+	xfs_dquot_t *dqp)
+{
+	mutex_unlock(&(dqp->q_qlock));
+}
+
+void
+xfs_dqlock2(
+	xfs_dquot_t	*d1,
+	xfs_dquot_t	*d2)
+{
+	if (d1 && d2) {
+		ASSERT(d1 != d2);
+		if (INT_GET(d1->q_core.d_id, ARCH_CONVERT) > INT_GET(d2->q_core.d_id, ARCH_CONVERT)) {
+			xfs_dqlock(d2);
+			xfs_dqlock(d1);
+		} else {
+			xfs_dqlock(d1);
+			xfs_dqlock(d2);
+		}
+	} else {
+		if (d1) {
+			xfs_dqlock(d1);
+		} else if (d2) {
+			xfs_dqlock(d2);
+		}
+	}
+}
+
+
+/*
+ * A rarely used accessor. This exists because we don't really want
+ * to expose the internals of a dquot to the outside world.
+ */
+xfs_dqid_t
+xfs_qm_dqid(
+	xfs_dquot_t	*dqp)
+{
+	return (INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
+}
+
+
+/*
+ * Take a dquot out of the mount's dqlist as well as the hashlist.
+ * This is called via unmount as well as quotaoff, and the purge
+ * will always succeed unless there are soft (temp) references
+ * outstanding.
+ *
+ * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
+ * that we're returning! XXXsup - not cool.
+ */
+/* ARGSUSED */
+int
+xfs_qm_dqpurge(
+	xfs_dquot_t	*dqp,
+	uint		flags)
+{
+	xfs_dqhash_t	*thishash;
+	xfs_mount_t	*mp;
+
+	mp = dqp->q_mount;
+
+	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
+
+	xfs_dqlock(dqp);
+	/*
+	 * We really can't afford to purge a dquot that is
+	 * referenced, because these are hard refs.
+	 * It shouldn't happen in general because we went thru _all_ inodes in
+	 * dqrele_all_inodes before calling this and didn't let the mountlock go.
+	 * However it is possible that we have dquots with temporary
+	 * references that are not attached to an inode. e.g. see xfs_setattr().
+	 */
+	if (dqp->q_nrefs != 0) {
+		xfs_dqunlock(dqp);
+		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+		return (1);
+	}
+
+	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+
+	/*
+	 * If we're turning off quotas, we have to make sure that, for
+	 * example, we don't delete quota disk blocks while dquots are
+	 * in the process of getting written to those disk blocks.
+	 * This dquot might well be on AIL, and we can't leave it there
+	 * if we're turning off quotas. Basically, we need this flush
+	 * lock, and are willing to block on it.
+	 */
+	if (! xfs_qm_dqflock_nowait(dqp)) {
+		/*
+		 * Block on the flush lock after nudging dquot buffer,
+		 * if it is incore.
+		 */
+		xfs_qm_dqflock_pushbuf_wait(dqp);
+	}
+
+	/*
+	 * XXXIf we're turning this type of quotas off, we don't care
+	 * about the dirty metadata sitting in this dquot. OTOH, if
+	 * we're unmounting, we do care, so we flush it and wait.
+	 */
+	if (XFS_DQ_IS_DIRTY(dqp)) {
+		xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
+		/* dqflush unlocks dqflock */
+		/*
+		 * Given that dqpurge is a very rare occurence, it is OK
+		 * that we're holding the hashlist and mplist locks
+		 * across the disk write. But, ... XXXsup
+		 *
+		 * We don't care about getting disk errors here. We need
+		 * to purge this dquot anyway, so we go ahead regardless.
+		 */
+		(void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
+		xfs_dqflock(dqp);
+	}
+	ASSERT(dqp->q_pincount == 0);
+	ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+	       !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+
+	thishash = dqp->q_hash;
+	XQM_HASHLIST_REMOVE(thishash, dqp);
+	XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp);
+	/*
+	 * XXX Move this to the front of the freelist, if we can get the
+	 * freelist lock.
+	 */
+	ASSERT(XFS_DQ_IS_ON_FREELIST(dqp));
+
+	dqp->q_mount = NULL;;
+	dqp->q_hash = NULL;
+	dqp->dq_flags = XFS_DQ_INACTIVE;
+	bzero(&dqp->q_core, sizeof(dqp->q_core));
+	xfs_dqfunlock(dqp);
+	xfs_dqunlock(dqp);
+	XFS_DQ_HASH_UNLOCK(thishash);
+	return (0);
+}
+
+
+/*
+ * Do some primitive error checking on ondisk dquot
+ * data structures. Not just for debugging, actually;
+ * this can be useful for detecting data corruption mainly due to
+ * disk failures.
+ */
+/* ARGSUSED */
+int
+xfs_qm_dqcheck(
+	xfs_disk_dquot_t *ddq,
+	xfs_dqid_t	 id,
+	uint		 type,	  /* used only when IO_dorepair is true */
+	uint		 flags,
+	char		 *str)
+{
+	int errs;
+
+	errs = 0;
+	/* ASSERT(flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN)); */
+	/*
+	 * We can encounter an uninitialized dquot buffer for 2 reasons:
+	 * 1. If we crash while deleting the quotainode(s), and those blks get used
+	 *    for some user data. This is because we take the path of regular
+	 *    file deletion; however, the size field of quotainodes is never
+	 *    updated, so all the tricks that we play in itruncate_finish
+	 *    don't quite matter.
+	 *
+	 * 2. We don't play the quota buffers when there's a quotaoff logitem.
+	 *    But the allocation will be replayed so we'll end up with an
+	 *    uninitialized quota block.
+	 *
+	 * This is all fine; things are still consistent, and we haven't lost
+	 * any quota information. Just don't complain about bad dquot blks.
+	 */
+	if (INT_GET(ddq->d_magic, ARCH_CONVERT) != XFS_DQUOT_MAGIC) {
+		if (flags & XFS_QMOPT_DOWARN)
+			cmn_err(CE_ALERT,
+			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+			str, id, INT_GET(ddq->d_magic, ARCH_CONVERT), XFS_DQUOT_MAGIC);
+		errs++;
+	}
+	if (INT_GET(ddq->d_version, ARCH_CONVERT) != XFS_DQUOT_VERSION) {
+		if (flags & XFS_QMOPT_DOWARN)
+			cmn_err(CE_ALERT,
+			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+			str, id, INT_GET(ddq->d_magic, ARCH_CONVERT), XFS_DQUOT_VERSION);
+		errs++;
+	}
+
+	if (INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_USER && INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_GROUP) {
+		if (flags & XFS_QMOPT_DOWARN)
+			cmn_err(CE_ALERT,
+			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+			str, id, INT_GET(ddq->d_flags, ARCH_CONVERT));
+		errs++;
+	}
+
+	if (id != -1 && id != INT_GET(ddq->d_id, ARCH_CONVERT)) {
+		if (flags & XFS_QMOPT_DOWARN)
+			cmn_err(CE_ALERT,
+			"%s : ondisk-dquot 0x%x, ID mismatch: "
+			"0x%x expected, found id 0x%x",
+			str, ddq, id, INT_GET(ddq->d_id, ARCH_CONVERT));
+		errs++;
+	}
+
+	if (! errs) {
+		if (INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT) &&
+		    INT_GET(ddq->d_bcount, ARCH_CONVERT) >= INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT)) {
+			if (INT_ISZERO(ddq->d_btimer, ARCH_CONVERT) && !INT_ISZERO(ddq->d_id, ARCH_CONVERT)) {
+				if (flags & XFS_QMOPT_DOWARN)
+					cmn_err(CE_ALERT,
+					"%s : Dquot ID 0x%x (0x%x) "
+					"BLK TIMER NOT STARTED",
+					str, (int) INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
+				errs++;
+			}
+		}
+		if (INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT) &&
+		    INT_GET(ddq->d_icount, ARCH_CONVERT) >= INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT)) {
+			if (INT_ISZERO(ddq->d_itimer, ARCH_CONVERT) && !INT_ISZERO(ddq->d_id, ARCH_CONVERT)) {
+				if (flags & XFS_QMOPT_DOWARN)
+					cmn_err(CE_ALERT,
+					"%s : Dquot ID 0x%x (0x%x) "
+					"INODE TIMER NOT STARTED",
+					str, (int) INT_GET(ddq->d_id, ARCH_CONVERT), ddq);
+				errs++;
+			}
+		}
+	}
+
+	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+		return (errs);
+
+	if (flags & XFS_QMOPT_DOWARN)
+		cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
+
+	/*
+	 * Typically, a repair is only requested by quotacheck.
+	 */
+	ASSERT(id != -1);
+	ASSERT(flags & XFS_QMOPT_DQREPAIR);
+	bzero(ddq, sizeof(xfs_dqblk_t));
+	xfs_qm_dqinit_core(id, type, (xfs_dqblk_t *)ddq);
+	return (errs);
+}
+
+#ifdef QUOTADEBUG
+void
+xfs_qm_dqprint(xfs_dquot_t *dqp)
+{
+	printk( "-----------KERNEL DQUOT----------------\n");
+	printk( "---- dquot ID	=  %d\n", (int) INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
+	printk( "---- type	=  %s\n", XFS_QM_ISUDQ(dqp) ? "USR" : "GRP");
+	printk( "---- fs	=  0x%p\n", dqp->q_mount);
+	printk( "---- blkno	=  0x%x\n", (int) dqp->q_blkno);
+	printk( "---- boffset	=  0x%x\n", (int) dqp->q_bufoffset);
+	printk( "---- blkhlimit =  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT),
+	       (int) INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT));
+	printk( "---- blkslimit =  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT));
+	printk( "---- inohlimit =  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT));
+	printk( "---- inoslimit =  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT));
+	printk( "---- bcount	=  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
+	printk( "---- icount	=  %Lu (0x%x)\n",
+	       INT_GET(dqp->q_core.d_icount, ARCH_CONVERT),
+	       (int)INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
+	printk( "---- btimer	=  %d\n", (int)INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT));
+	printk( "---- itimer	=  %d\n", (int)INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT));
+
+	printk( "---------------------------\n");
+}
+#endif
+
+/*
+ * Give the buffer a little push if it is incore and
+ * wait on the flush lock.
+ */
+void
+xfs_qm_dqflock_pushbuf_wait(
+	xfs_dquot_t	*dqp)
+{
+	xfs_buf_t	*bp;
+
+	/*
+	 * Check to see if the dquot has been flushed delayed
+	 * write.  If so, grab its buffer and send it
+	 * out immediately.  We'll be able to acquire
+	 * the flush lock when the I/O completes.
+	 */
+	bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno,
+		    XFS_QI_DQCHUNKLEN(dqp->q_mount),
+		    XFS_INCORE_TRYLOCK);
+	if (bp != NULL) {
+		if (XFS_BUF_ISDELAYWRITE(bp)) {
+			if (XFS_BUF_ISPINNED(bp)) {
+				xfs_log_force(dqp->q_mount,
+					      (xfs_lsn_t)0,
+					      XFS_LOG_FORCE);
+			}
+			xfs_bawrite(dqp->q_mount, bp);
+		} else {
+			xfs_buf_relse(bp);
+		}
+	}
+	xfs_dqflock(dqp);
+}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
new file mode 100644
index 000000000000..455830aee721
--- /dev/null
+++ b/fs/xfs/xfs_dquot.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DQUOT_H__
+#define __XFS_DQUOT_H__
+
+/*
+ * Dquots are structures that hold quota information about a user or a group,
+ * much like inodes are for files. In fact, dquots share many characteristics
+ * with inodes. However, dquots can also be a centralized resource, relative
+ * to a collection of inodes. In this respect, dquots share some characteristics
+ * of the superblock.
+ * XFS dquots exploit both those in its algorithms. They make every attempt
+ * to not be a bottleneck when quotas are on and have minimal impact, if any,
+ * when quotas are off.
+ */
+
+/*
+ * The hash chain headers (hash buckets)
+ */
+typedef struct xfs_dqhash {
+	struct xfs_dquot *qh_next;
+	mutex_t		  qh_lock;
+	uint		  qh_version;	/* ever increasing version */
+	uint		  qh_nelems;	/* number of dquots on the list */
+} xfs_dqhash_t;
+
+typedef struct xfs_dqlink {
+	struct xfs_dquot  *ql_next;	/* forward link */
+	struct xfs_dquot **ql_prevp;	/* pointer to prev ql_next */
+} xfs_dqlink_t;
+
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * This is the marker which is designed to occupy the first few
+ * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers
+ * must come first.
+ * This serves as the marker ("sentinel") when we have to restart list
+ * iterations because of locking considerations.
+ */
+typedef struct xfs_dqmarker {
+	struct xfs_dquot*dqm_flnext;	/* link to freelist: must be first */
+	struct xfs_dquot*dqm_flprev;
+	xfs_dqlink_t	 dqm_mplist;	/* link to mount's list of dquots */
+	xfs_dqlink_t	 dqm_hashlist;	/* link to the hash chain */
+	uint		 dqm_flags;	/* various flags (XFS_DQ_*) */
+} xfs_dqmarker_t;
+
+/*
+ * The incore dquot structure
+ */
+typedef struct xfs_dquot {
+	xfs_dqmarker_t	 q_lists;	/* list ptrs, q_flags (marker) */
+	xfs_dqhash_t	*q_hash;	/* the hashchain header */
+	struct xfs_mount*q_mount;	/* filesystem this relates to */
+	struct xfs_trans*q_transp;	/* trans this belongs to currently */
+	uint		 q_nrefs;	/* # active refs from inodes */
+	xfs_daddr_t	 q_blkno;	/* blkno of dquot buffer */
+	int		 q_bufoffset;	/* off of dq in buffer (# dquots) */
+	xfs_fileoff_t	 q_fileoffset;	/* offset in quotas file */
+
+	struct xfs_dquot*q_gdquot;	/* group dquot, hint only */
+	xfs_disk_dquot_t q_core;	/* actual usage & quotas */
+	xfs_dq_logitem_t q_logitem;	/* dquot log item */
+	xfs_qcnt_t	 q_res_bcount;	/* total regular nblks used+reserved */
+	xfs_qcnt_t	 q_res_icount;	/* total inos allocd+reserved */
+	xfs_qcnt_t	 q_res_rtbcount;/* total realtime blks used+reserved */
+	mutex_t		 q_qlock;	/* quota lock */
+	sema_t		 q_flock;	/* flush lock */
+	uint		 q_pincount;	/* pin count for this dquot */
+	sv_t		 q_pinwait;	/* sync var for pinning */
+#ifdef DQUOT_TRACING
+	struct ktrace	*q_trace;	/* trace header structure */
+#endif
+} xfs_dquot_t;
+
+
+#define dq_flnext	q_lists.dqm_flnext
+#define dq_flprev	q_lists.dqm_flprev
+#define dq_mplist	q_lists.dqm_mplist
+#define dq_hashlist	q_lists.dqm_hashlist
+#define dq_flags	q_lists.dqm_flags
+
+#define XFS_DQHOLD(dqp)		((dqp)->q_nrefs++)
+
+/*
+ * Quota Accounting flags
+ */
+#define XFS_ALL_QUOTA_ACCT	(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD	(XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD	(XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD)
+#define XFS_ALL_QUOTA_ACTV	(XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
+#define XFS_ALL_QUOTA_ACCT_ENFD (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+				 XFS_GQUOTA_ACCT|XFS_GQUOTA_ENFD)
+
+#define XFS_IS_QUOTA_RUNNING(mp)  ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+
+/*
+ * Quota Limit Enforcement flags
+ */
+#define XFS_IS_QUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_ALL_QUOTA_ENFD)
+#define XFS_IS_UQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_UQUOTA_ENFD)
+#define XFS_IS_GQUOTA_ENFORCED(mp)	((mp)->m_qflags & XFS_GQUOTA_ENFD)
+
+#ifdef DEBUG
+static inline int
+XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
+{
+	if (mutex_trylock(&dqp->q_qlock)) {
+		mutex_unlock(&dqp->q_qlock);
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+/*
+ * The following three routines simply manage the q_flock
+ * semaphore embedded in the dquot.  This semaphore synchronizes
+ * processes attempting to flush the in-core dquot back to disk.
+ */
+#define xfs_dqflock(dqp)	 { psema(&((dqp)->q_flock), PINOD | PRECALC);\
+				   (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
+#define xfs_dqfunlock(dqp)	 { ASSERT(valusema(&((dqp)->q_flock)) <= 0); \
+				   vsema(&((dqp)->q_flock)); \
+				   (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
+
+#define XFS_DQ_PINLOCK(dqp)	   mutex_spinlock( \
+				     &(XFS_DQ_TO_QINF(dqp)->qi_pinlock))
+#define XFS_DQ_PINUNLOCK(dqp, s)   mutex_spinunlock( \
+				     &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s)
+
+#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (valusema(&((dqp)->q_flock)) <= 0)
+#define XFS_DQ_IS_ON_FREELIST(dqp)  ((dqp)->dq_flnext != (dqp))
+#define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
+#define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
+#define XFS_DQ_TO_QINF(dqp)	((dqp)->q_mount->m_quotainfo)
+#define XFS_DQ_TO_QIP(dqp)	(XFS_QM_ISUDQ(dqp) ? \
+				 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
+				 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
+
+#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
+				     (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
+				     (XFS_IS_GQUOTA_ON((d)->q_mount))))
+#ifdef DQUOT_TRACING
+/*
+ * Dquot Tracing stuff.
+ */
+#define DQUOT_TRACE_SIZE	64
+#define DQUOT_KTRACE_ENTRY	1
+
+#define xfs_dqtrace_entry_ino(a,b,ip) \
+xfs_dqtrace_entry__((a), (b), (void*)__return_address, (ip))
+#define xfs_dqtrace_entry(a,b) \
+xfs_dqtrace_entry__((a), (b), (void*)__return_address, NULL)
+extern void		xfs_dqtrace_entry__(xfs_dquot_t *dqp, char *func,
+					    void *, xfs_inode_t *);
+#else
+#define xfs_dqtrace_entry(a,b)
+#define xfs_dqtrace_entry_ino(a,b,ip)
+#endif
+#ifdef QUOTADEBUG
+extern void		xfs_qm_dqprint(xfs_dquot_t *);
+#else
+#define xfs_qm_dqprint(a)
+#endif
+
+extern xfs_dquot_t	*xfs_qm_dqinit(xfs_mount_t *, xfs_dqid_t, uint);
+extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
+extern int		xfs_qm_dqflush(xfs_dquot_t *, uint);
+extern int		xfs_qm_dqpurge(xfs_dquot_t *, uint);
+extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
+extern int		xfs_qm_dqlock_nowait(xfs_dquot_t *);
+extern int		xfs_qm_dqflock_nowait(xfs_dquot_t *);
+extern void		xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
+extern void		xfs_qm_adjust_dqtimers(xfs_mount_t *,
+					       xfs_disk_dquot_t *);
+extern int		xfs_qm_dqwarn(xfs_disk_dquot_t *, uint);
+
+#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
new file mode 100644
index 000000000000..bd8d7de4faab
--- /dev/null
+++ b/fs/xfs/xfs_dquot_item.c
@@ -0,0 +1,675 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+
+/*
+ * returns the number of iovecs needed to log the given dquot item.
+ */
+/* ARGSUSED */
+STATIC uint
+xfs_qm_dquot_logitem_size(
+	xfs_dq_logitem_t	*logitem)
+{
+	/*
+	 * we need only two iovecs, one for the format, one for the real thing
+	 */
+	return (2);
+}
+
+/*
+ * fills in the vector of log iovecs for the given dquot log item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_format(
+	xfs_dq_logitem_t	*logitem,
+	xfs_log_iovec_t		*logvec)
+{
+	ASSERT(logitem);
+	ASSERT(logitem->qli_dquot);
+
+	logvec->i_addr = (xfs_caddr_t)&logitem->qli_format;
+	logvec->i_len  = sizeof(xfs_dq_logformat_t);
+	logvec++;
+	logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core;
+	logvec->i_len  = sizeof(xfs_disk_dquot_t);
+
+	ASSERT(2 == logitem->qli_item.li_desc->lid_size);
+	logitem->qli_format.qlf_size = 2;
+
+}
+
+/*
+ * Increment the pin count of the given dquot.
+ * This value is protected by pinlock spinlock in the xQM structure.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pin(
+	xfs_dq_logitem_t *logitem)
+{
+	unsigned long	s;
+	xfs_dquot_t *dqp;
+
+	dqp = logitem->qli_dquot;
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	s = XFS_DQ_PINLOCK(dqp);
+	dqp->q_pincount++;
+	XFS_DQ_PINUNLOCK(dqp, s);
+}
+
+/*
+ * Decrement the pin count of the given dquot, and wake up
+ * anyone in xfs_dqwait_unpin() if the count goes to 0.	 The
+ * dquot must have been previously pinned with a call to xfs_dqpin().
+ */
+STATIC void
+xfs_qm_dquot_logitem_unpin(
+	xfs_dq_logitem_t *logitem)
+{
+	unsigned long	s;
+	xfs_dquot_t *dqp;
+
+	dqp = logitem->qli_dquot;
+	ASSERT(dqp->q_pincount > 0);
+	s = XFS_DQ_PINLOCK(dqp);
+	dqp->q_pincount--;
+	if (dqp->q_pincount == 0) {
+		sv_broadcast(&dqp->q_pinwait);
+	}
+	XFS_DQ_PINUNLOCK(dqp, s);
+}
+
+/* ARGSUSED */
+STATIC void
+xfs_qm_dquot_logitem_unpin_remove(
+	xfs_dq_logitem_t *logitem,
+	xfs_trans_t	 *tp)
+{
+	xfs_qm_dquot_logitem_unpin(logitem);
+}
+
+/*
+ * Given the logitem, this writes the corresponding dquot entry to disk
+ * asynchronously. This is called with the dquot entry securely locked;
+ * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
+ * at the end.
+ */
+STATIC void
+xfs_qm_dquot_logitem_push(
+	xfs_dq_logitem_t	*logitem)
+{
+	xfs_dquot_t	*dqp;
+
+	dqp = logitem->qli_dquot;
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+
+	/*
+	 * Since we were able to lock the dquot's flush lock and
+	 * we found it on the AIL, the dquot must be dirty.  This
+	 * is because the dquot is removed from the AIL while still
+	 * holding the flush lock in xfs_dqflush_done().  Thus, if
+	 * we found it in the AIL and were able to obtain the flush
+	 * lock without sleeping, then there must not have been
+	 * anyone in the process of flushing the dquot.
+	 */
+	xfs_qm_dqflush(dqp, XFS_B_DELWRI);
+	xfs_dqunlock(dqp);
+}
+
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_qm_dquot_logitem_committed(
+	xfs_dq_logitem_t	*l,
+	xfs_lsn_t		lsn)
+{
+	/*
+	 * We always re-log the entire dquot when it becomes dirty,
+	 * so, the latest copy _is_ the only one that matters.
+	 */
+	return (lsn);
+}
+
+
+/*
+ * This is called to wait for the given dquot to be unpinned.
+ * Most of these pin/unpin routines are plagiarized from inode code.
+ */
+void
+xfs_qm_dqunpin_wait(
+	xfs_dquot_t	*dqp)
+{
+	SPLDECL(s);
+
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	if (dqp->q_pincount == 0) {
+		return;
+	}
+
+	/*
+	 * Give the log a push so we don't wait here too long.
+	 */
+	xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	s = XFS_DQ_PINLOCK(dqp);
+	if (dqp->q_pincount == 0) {
+		XFS_DQ_PINUNLOCK(dqp, s);
+		return;
+	}
+	sv_wait(&(dqp->q_pinwait), PINOD,
+		&(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
+}
+
+/*
+ * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
+ * the dquot is locked by us, but the flush lock isn't. So, here we are
+ * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
+ * If so, we want to push it out to help us take this item off the AIL as soon
+ * as possible.
+ *
+ * We must not be holding the AIL_LOCK at this point. Calling incore() to
+ * search the buffercache can be a time consuming thing, and AIL_LOCK is a
+ * spinlock.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pushbuf(
+	xfs_dq_logitem_t    *qip)
+{
+	xfs_dquot_t	*dqp;
+	xfs_mount_t	*mp;
+	xfs_buf_t	*bp;
+	uint		dopush;
+
+	dqp = qip->qli_dquot;
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	/*
+	 * The qli_pushbuf_flag keeps others from
+	 * trying to duplicate our effort.
+	 */
+	ASSERT(qip->qli_pushbuf_flag != 0);
+	ASSERT(qip->qli_push_owner == get_thread_id());
+
+	/*
+	 * If flushlock isn't locked anymore, chances are that the
+	 * inode flush completed and the inode was taken off the AIL.
+	 * So, just get out.
+	 */
+	if ((valusema(&(dqp->q_flock)) > 0)  ||
+	    ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+		qip->qli_pushbuf_flag = 0;
+		xfs_dqunlock(dqp);
+		return;
+	}
+	mp = dqp->q_mount;
+	bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
+		    XFS_QI_DQCHUNKLEN(mp),
+		    XFS_INCORE_TRYLOCK);
+	if (bp != NULL) {
+		if (XFS_BUF_ISDELAYWRITE(bp)) {
+			dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
+				  (valusema(&(dqp->q_flock)) <= 0));
+			qip->qli_pushbuf_flag = 0;
+			xfs_dqunlock(dqp);
+
+			if (XFS_BUF_ISPINNED(bp)) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE);
+			}
+			if (dopush) {
+#ifdef XFSRACEDEBUG
+				delay_for_intr();
+				delay(300);
+#endif
+				xfs_bawrite(mp, bp);
+			} else {
+				xfs_buf_relse(bp);
+			}
+		} else {
+			qip->qli_pushbuf_flag = 0;
+			xfs_dqunlock(dqp);
+			xfs_buf_relse(bp);
+		}
+		return;
+	}
+
+	qip->qli_pushbuf_flag = 0;
+	xfs_dqunlock(dqp);
+}
+
+/*
+ * This is called to attempt to lock the dquot associated with this
+ * dquot log item.  Don't sleep on the dquot lock or the flush lock.
+ * If the flush lock is already held, indicating that the dquot has
+ * been or is in the process of being flushed, then see if we can
+ * find the dquot's buffer in the buffer cache without sleeping.  If
+ * we can and it is marked delayed write, then we want to send it out.
+ * We delay doing so until the push routine, though, to avoid sleeping
+ * in any device strategy routines.
+ */
+STATIC uint
+xfs_qm_dquot_logitem_trylock(
+	xfs_dq_logitem_t	*qip)
+{
+	xfs_dquot_t		*dqp;
+	uint			retval;
+
+	dqp = qip->qli_dquot;
+	if (dqp->q_pincount > 0)
+		return (XFS_ITEM_PINNED);
+
+	if (! xfs_qm_dqlock_nowait(dqp))
+		return (XFS_ITEM_LOCKED);
+
+	retval = XFS_ITEM_SUCCESS;
+	if (! xfs_qm_dqflock_nowait(dqp)) {
+		/*
+		 * The dquot is already being flushed.	It may have been
+		 * flushed delayed write, however, and we don't want to
+		 * get stuck waiting for that to complete.  So, we want to check
+		 * to see if we can lock the dquot's buffer without sleeping.
+		 * If we can and it is marked for delayed write, then we
+		 * hold it and send it out from the push routine.  We don't
+		 * want to do that now since we might sleep in the device
+		 * strategy routine.  We also don't want to grab the buffer lock
+		 * here because we'd like not to call into the buffer cache
+		 * while holding the AIL_LOCK.
+		 * Make sure to only return PUSHBUF if we set pushbuf_flag
+		 * ourselves.  If someone else is doing it then we don't
+		 * want to go to the push routine and duplicate their efforts.
+		 */
+		if (qip->qli_pushbuf_flag == 0) {
+			qip->qli_pushbuf_flag = 1;
+			ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
+#ifdef DEBUG
+			qip->qli_push_owner = get_thread_id();
+#endif
+			/*
+			 * The dquot is left locked.
+			 */
+			retval = XFS_ITEM_PUSHBUF;
+		} else {
+			retval = XFS_ITEM_FLUSHING;
+			xfs_dqunlock_nonotify(dqp);
+		}
+	}
+
+	ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
+	return (retval);
+}
+
+
+/*
+ * Unlock the dquot associated with the log item.
+ * Clear the fields of the dquot and dquot log item that
+ * are specific to the current transaction.  If the
+ * hold flags is set, do not unlock the dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_unlock(
+	xfs_dq_logitem_t    *ql)
+{
+	xfs_dquot_t	*dqp;
+
+	ASSERT(ql != NULL);
+	dqp = ql->qli_dquot;
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	/*
+	 * Clear the transaction pointer in the dquot
+	 */
+	dqp->q_transp = NULL;
+
+	/*
+	 * dquots are never 'held' from getting unlocked at the end of
+	 * a transaction.  Their locking and unlocking is hidden inside the
+	 * transaction layer, within trans_commit. Hence, no LI_HOLD flag
+	 * for the logitem.
+	 */
+	xfs_dqunlock(dqp);
+}
+
+
+/*
+ * The transaction with the dquot locked has aborted.  The dquot
+ * must not be dirty within the transaction.  We simply unlock just
+ * as if the transaction had been cancelled.
+ */
+STATIC void
+xfs_qm_dquot_logitem_abort(
+	xfs_dq_logitem_t    *ql)
+{
+	xfs_qm_dquot_logitem_unlock(ql);
+}
+
+/*
+ * this needs to stamp an lsn into the dquot, I think.
+ * rpc's that look at user dquot's would then have to
+ * push on the dependency recorded in the dquot
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_dquot_logitem_committing(
+	xfs_dq_logitem_t	*l,
+	xfs_lsn_t		lsn)
+{
+	return;
+}
+
+
+/*
+ * This is the ops vector for dquots
+ */
+struct xfs_item_ops xfs_dquot_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_qm_dquot_logitem_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
+					xfs_qm_dquot_logitem_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))
+					xfs_qm_dquot_logitem_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_dquot_logitem_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort,
+	.iop_pushbuf	= (void(*)(xfs_log_item_t*))
+					xfs_qm_dquot_logitem_pushbuf,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_dquot_logitem_committing
+};
+
+/*
+ * Initialize the dquot log item for a newly allocated dquot.
+ * The dquot isn't locked at this point, but it isn't on any of the lists
+ * either, so we don't care.
+ */
+void
+xfs_qm_dquot_logitem_init(
+	struct xfs_dquot *dqp)
+{
+	xfs_dq_logitem_t  *lp;
+	lp = &dqp->q_logitem;
+
+	lp->qli_item.li_type = XFS_LI_DQUOT;
+	lp->qli_item.li_ops = &xfs_dquot_item_ops;
+	lp->qli_item.li_mountp = dqp->q_mount;
+	lp->qli_dquot = dqp;
+	lp->qli_format.qlf_type = XFS_LI_DQUOT;
+	lp->qli_format.qlf_id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
+	lp->qli_format.qlf_blkno = dqp->q_blkno;
+	lp->qli_format.qlf_len = 1;
+	/*
+	 * This is just the offset of this dquot within its buffer
+	 * (which is currently 1 FSB and probably won't change).
+	 * Hence 32 bits for this offset should be just fine.
+	 * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
+	 * here, and recompute it at recovery time.
+	 */
+	lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
+}
+
+/*------------------  QUOTAOFF LOG ITEMS  -------------------*/
+
+/*
+ * This returns the number of iovecs needed to log the given quotaoff item.
+ * We only need 1 iovec for an quotaoff item.  It just logs the
+ * quotaoff_log_format structure.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf)
+{
+	return (1);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given quotaoff log item. We use only 1 iovec, and we point that
+ * at the quotaoff_log_format structure embedded in the quotaoff item.
+ * It is at this point that we assert that all of the extent
+ * slots in the quotaoff item have been filled.
+ */
+STATIC void
+xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t	*qf,
+			   xfs_log_iovec_t	*log_vector)
+{
+	ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF);
+
+	log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
+	log_vector->i_len = sizeof(xfs_qoff_logitem_t);
+	qf->qql_format.qf_size = 1;
+}
+
+
+/*
+ * Pinning has no meaning for an quotaoff item, so just return.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf)
+{
+	return;
+}
+
+
+/*
+ * Since pinning has no meaning for an quotaoff item, unpinning does
+ * not either.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf)
+{
+	return;
+}
+
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp)
+{
+	return;
+}
+
+/*
+ * Quotaoff items have no locking, so just return success.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf)
+{
+	return XFS_ITEM_LOCKED;
+}
+
+/*
+ * Quotaoff items have no locking or pushing, so return failure
+ * so that the caller doesn't bother with us.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf)
+{
+	return;
+}
+
+/*
+ * The quotaoff-start-item is logged only once and cannot be moved in the log,
+ * so simply return the lsn at which it's been logged.
+ */
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
+{
+	return (lsn);
+}
+
+/*
+ * The transaction of which this QUOTAOFF is a part has been aborted.
+ * Just clean up after ourselves.
+ * Shouldn't this never happen in the case of qoffend logitems? XXX
+ */
+STATIC void
+xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf)
+{
+	kmem_free(qf, sizeof(xfs_qoff_logitem_t));
+}
+
+/*
+ * There isn't much you can do to push on an quotaoff item.  It is simply
+ * stuck waiting for the log to be flushed to disk.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf)
+{
+	return;
+}
+
+
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_qm_qoffend_logitem_committed(
+	xfs_qoff_logitem_t *qfe,
+	xfs_lsn_t lsn)
+{
+	xfs_qoff_logitem_t	*qfs;
+	SPLDECL(s);
+
+	qfs = qfe->qql_start_lip;
+	AIL_LOCK(qfs->qql_item.li_mountp,s);
+	/*
+	 * Delete the qoff-start logitem from the AIL.
+	 * xfs_trans_delete_ail() drops the AIL lock.
+	 */
+	xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs, s);
+	kmem_free(qfs, sizeof(xfs_qoff_logitem_t));
+	kmem_free(qfe, sizeof(xfs_qoff_logitem_t));
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * XXX rcc - don't know quite what to do with this.  I think we can
+ * just ignore it.  The only time that isn't the case is if we allow
+ * the client to somehow see that quotas have been turned off in which
+ * we can't allow that to get back until the quotaoff hits the disk.
+ * So how would that happen?  Also, do we need different routines for
+ * quotaoff start and quotaoff end?  I suspect the answer is yes but
+ * to be sure, I need to look at the recovery code and see how quota off
+ * recovery is handled (do we roll forward or back or do something else).
+ * If we roll forwards or backwards, then we need two separate routines,
+ * one that does nothing and one that stamps in the lsn that matters
+ * (truly makes the quotaoff irrevocable).  If we do something else,
+ * then maybe we don't need two.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
+{
+	return;
+}
+
+/* ARGSUSED */
+STATIC void
+xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn)
+{
+	return;
+}
+
+struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_qm_qoff_logitem_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
+					xfs_qm_qoff_logitem_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_qoffend_logitem_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_qoffend_logitem_committing
+};
+
+/*
+ * This is the ops vector shared by all quotaoff-start log items.
+ */
+struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_qm_qoff_logitem_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*))
+					xfs_qm_qoff_logitem_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_qoff_logitem_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * Allocate and initialize an quotaoff item of the correct quota type(s).
+ */
+xfs_qoff_logitem_t *
+xfs_qm_qoff_logitem_init(
+	struct xfs_mount *mp,
+	xfs_qoff_logitem_t *start,
+	uint flags)
+{
+	xfs_qoff_logitem_t	*qf;
+
+	qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP);
+
+	qf->qql_item.li_type = XFS_LI_QUOTAOFF;
+	if (start)
+		qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops;
+	else
+		qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops;
+	qf->qql_item.li_mountp = mp;
+	qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
+	qf->qql_format.qf_flags = flags;
+	qf->qql_start_lip = start;
+	return (qf);
+}
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
new file mode 100644
index 000000000000..a11987e5efc8
--- /dev/null
+++ b/fs/xfs/xfs_dquot_item.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_DQUOT_ITEM_H__
+#define __XFS_DQUOT_ITEM_H__
+
+/*
+ * These are the structures used to lay out dquots and quotaoff
+ * records on the log. Quite similar to those of inodes.
+ */
+
+/*
+ * log format struct for dquots.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ */
+typedef struct xfs_dq_logformat {
+	__uint16_t		qlf_type;      /* dquot log item type */
+	__uint16_t		qlf_size;      /* size of this item */
+	xfs_dqid_t		qlf_id;	       /* usr/grp id number : 32 bits */
+	__int64_t		qlf_blkno;     /* blkno of dquot buffer */
+	__int32_t		qlf_len;       /* len of dquot buffer */
+	__uint32_t		qlf_boffset;   /* off of dquot in buffer */
+} xfs_dq_logformat_t;
+
+/*
+ * log format struct for QUOTAOFF records.
+ * The first two fields must be the type and size fitting into
+ * 32 bits : log_recovery code assumes that.
+ * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
+ * to the first and ensures that the first logitem is taken out of the AIL
+ * only when the last one is securely committed.
+ */
+typedef struct xfs_qoff_logformat {
+	unsigned short		qf_type;	/* quotaoff log item type */
+	unsigned short		qf_size;	/* size of this item */
+	unsigned int		qf_flags;	/* USR and/or GRP */
+	char			qf_pad[12];	/* padding for future */
+} xfs_qoff_logformat_t;
+
+
+#ifdef __KERNEL__
+
+struct xfs_dquot;
+struct xfs_trans;
+struct xfs_mount;
+typedef struct xfs_dq_logitem {
+	xfs_log_item_t		 qli_item;	   /* common portion */
+	struct xfs_dquot	*qli_dquot;	   /* dquot ptr */
+	xfs_lsn_t		 qli_flush_lsn;	   /* lsn at last flush */
+	unsigned short		 qli_pushbuf_flag; /* one bit used in push_ail */
+#ifdef DEBUG
+	uint64_t		 qli_push_owner;
+#endif
+	xfs_dq_logformat_t	 qli_format;	   /* logged structure */
+} xfs_dq_logitem_t;
+
+
+typedef struct xfs_qoff_logitem {
+	xfs_log_item_t		 qql_item;	/* common portion */
+	struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
+	xfs_qoff_logformat_t	 qql_format;	/* logged structure */
+} xfs_qoff_logitem_t;
+
+
+extern void		   xfs_qm_dquot_logitem_init(struct xfs_dquot *);
+extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
+						    xfs_qoff_logitem_t *, uint);
+extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
+						   xfs_qoff_logitem_t *, uint);
+extern void		   xfs_trans_log_quotaoff_item(struct xfs_trans *,
+						       xfs_qoff_logitem_t *);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
new file mode 100644
index 000000000000..b5ceb0316aba
--- /dev/null
+++ b/fs/xfs/xfs_error.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+#ifdef DEBUG
+
+int	xfs_etrap[XFS_ERROR_NTRAP] = {
+	0,
+};
+
+int
+xfs_error_trap(int e)
+{
+	int i;
+
+	if (!e)
+		return 0;
+	for (i = 0; i < XFS_ERROR_NTRAP; i++) {
+		if (xfs_etrap[i] == 0)
+			break;
+		if (e != xfs_etrap[i])
+			continue;
+		cmn_err(CE_NOTE, "xfs_error_trap: error %d", e);
+		debug_stop_all_cpus((void *)-1LL);
+		BUG();
+		break;
+	}
+	return e;
+}
+#endif
+
+#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+
+int	xfs_etest[XFS_NUM_INJECT_ERROR];
+int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
+char *	xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
+
+void
+xfs_error_test_init(void)
+{
+	bzero(xfs_etest, sizeof(xfs_etest));
+	bzero(xfs_etest_fsid, sizeof(xfs_etest_fsid));
+	bzero(xfs_etest_fsname, sizeof(xfs_etest_fsname));
+}
+
+int
+xfs_error_test(int error_tag, int *fsidp, char *expression,
+	       int line, char *file, unsigned long randfactor)
+{
+	int i;
+	int64_t fsid;
+
+	if (random() % randfactor)
+		return 0;
+
+	bcopy(fsidp, &fsid, sizeof(fsid_t));
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
+		if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
+			cmn_err(CE_WARN,
+	"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"\n",
+				expression, file, line, xfs_etest_fsname[i]);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+int
+xfs_errortag_add(int error_tag, xfs_mount_t *mp)
+{
+	int i;
+	int len;
+	int64_t fsid;
+
+	bcopy(mp->m_fixedfsid, &fsid, sizeof(fsid_t));
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
+		if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
+			cmn_err(CE_WARN, "XFS error tag #%d on", error_tag);
+			return 0;
+		}
+	}
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++)  {
+		if (xfs_etest[i] == 0) {
+			cmn_err(CE_WARN, "Turned on XFS error tag #%d",
+				error_tag);
+			xfs_etest[i] = error_tag;
+			xfs_etest_fsid[i] = fsid;
+			len = strlen(mp->m_fsname);
+			xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP);
+			strcpy(xfs_etest_fsname[i], mp->m_fsname);
+			return 0;
+		}
+	}
+
+	cmn_err(CE_WARN, "error tag overflow, too many turned on");
+
+	return 1;
+}
+
+int
+xfs_errortag_clear(int error_tag, xfs_mount_t *mp)
+{
+	int i;
+	int64_t fsid;
+
+	bcopy(mp->m_fixedfsid, &fsid, sizeof(fsid_t));
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
+		if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
+			xfs_etest[i] = 0;
+			xfs_etest_fsid[i] = 0LL;
+			kmem_free(xfs_etest_fsname[i],
+				  strlen(xfs_etest_fsname[i]) + 1);
+			xfs_etest_fsname[i] = NULL;
+			cmn_err(CE_WARN, "Cleared XFS error tag #%d",
+				error_tag);
+			return 0;
+		}
+	}
+
+	cmn_err(CE_WARN, "XFS error tag %d not on", error_tag);
+
+	return 1;
+}
+
+int
+xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud)
+{
+	int i;
+	int cleared = 0;
+
+	for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
+		if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
+		     xfs_etest[i] != 0) {
+			cleared = 1;
+			cmn_err(CE_WARN, "Clearing XFS error tag #%d",
+				xfs_etest[i]);
+			xfs_etest[i] = 0;
+			xfs_etest_fsid[i] = 0LL;
+			kmem_free(xfs_etest_fsname[i],
+				  strlen(xfs_etest_fsname[i]) + 1);
+			xfs_etest_fsname[i] = NULL;
+		}
+	}
+
+	if (loud || cleared)
+		cmn_err(CE_WARN,
+			"Cleared all XFS error tags for filesystem \"%s\"",
+			fsname);
+
+	return 0;
+}
+
+int
+xfs_errortag_clearall(xfs_mount_t *mp)
+{
+	int64_t fsid;
+
+	bcopy(mp->m_fixedfsid, &fsid, sizeof(fsid_t));
+
+	return xfs_errortag_clearall_umount(fsid, mp->m_fsname, 1);
+}
+#endif /* DEBUG || INDUCE_IO_ERROR */
+
+static void
+xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
+{
+	char	*newfmt;
+	int	len = 16 + mp->m_fsname_len + strlen(fmt);
+
+	newfmt = kmem_alloc(len, KM_SLEEP);
+	sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
+	icmn_err(level, newfmt, ap);
+	kmem_free(newfmt, len);
+}
+
+void
+xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	xfs_fs_vcmn_err(level, mp, fmt, ap);
+	va_end(ap);
+}
+
+void
+xfs_cmn_err(uint64_t panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
+{
+	va_list ap;
+
+	if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
+	    && (level & CE_ALERT)) {
+		level &= ~CE_ALERT;
+		level |= CE_PANIC;
+		cmn_err(CE_ALERT, "Transforming an alert into a panic.");
+	}
+	va_start(ap, fmt);
+	xfs_fs_vcmn_err(level, mp, fmt, ap);
+	va_end(ap);
+}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
new file mode 100644
index 000000000000..cfe82bee1c3c
--- /dev/null
+++ b/fs/xfs/xfs_error.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ERROR_H__
+#define __XFS_ERROR_H__
+
+#define prdev(fmt,dev,args...) \
+	printk("XFS: device 0x%x- " fmt "\n", dev, ## args)
+
+#define XFS_ERECOVER	1	/* Failure to recover log */
+#define XFS_ELOGSTAT	2	/* Failure to stat log in user space */
+#define XFS_ENOLOGSPACE 3	/* Reservation too large */
+#define XFS_ENOTSUP	4	/* Operation not supported */
+#define XFS_ENOLSN	5	/* Can't find the lsn you asked for */
+#define XFS_ENOTFOUND	6
+#define XFS_ENOTXFS	7	/* Not XFS filesystem */
+
+#ifdef DEBUG
+#define XFS_ERROR_NTRAP 10
+extern int	xfs_etrap[XFS_ERROR_NTRAP];
+extern int	xfs_error_trap(int);
+#define XFS_ERROR(e)	xfs_error_trap(e)
+#else
+#define XFS_ERROR(e)	(e)
+#endif
+
+
+/*
+ * error injection tags - the labels can be anything you want
+ * but each tag should have its own unique number
+ */
+
+#define XFS_ERRTAG_NOERROR				0
+#define XFS_ERRTAG_IFLUSH_1				1
+#define XFS_ERRTAG_IFLUSH_2				2
+#define XFS_ERRTAG_IFLUSH_3				3
+#define XFS_ERRTAG_IFLUSH_4				4
+#define XFS_ERRTAG_IFLUSH_5				5
+#define XFS_ERRTAG_IFLUSH_6				6
+#define XFS_ERRTAG_DA_READ_BUF				7
+#define XFS_ERRTAG_BTREE_CHECK_LBLOCK			8
+#define XFS_ERRTAG_BTREE_CHECK_SBLOCK			9
+#define XFS_ERRTAG_ALLOC_READ_AGF			10
+#define XFS_ERRTAG_IALLOC_READ_AGI			11
+#define XFS_ERRTAG_ITOBP_INOTOBP			12
+#define XFS_ERRTAG_IUNLINK				13
+#define XFS_ERRTAG_IUNLINK_REMOVE			14
+#define XFS_ERRTAG_DIR_INO_VALIDATE			15
+#define XFS_ERRTAG_BULKSTAT_READ_CHUNK			16
+#define XFS_ERRTAG_IODONE_IOERR				17
+#define XFS_ERRTAG_STRATREAD_IOERR			18
+#define XFS_ERRTAG_STRATCMPL_IOERR			19
+#define XFS_ERRTAG_DIOWRITE_IOERR			20
+#define XFS_ERRTAG_MAX					21
+
+/*
+ * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
+ */
+#define XFS_RANDOM_DEFAULT				100
+#define XFS_RANDOM_IFLUSH_1				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_2				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_3				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_4				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_5				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IFLUSH_6				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DA_READ_BUF				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BTREE_CHECK_LBLOCK			(XFS_RANDOM_DEFAULT/4)
+#define XFS_RANDOM_BTREE_CHECK_SBLOCK			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ALLOC_READ_AGF			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IALLOC_READ_AGI			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_ITOBP_INOTOBP			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK				XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IUNLINK_REMOVE			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_DIR_INO_VALIDATE			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_BULKSTAT_READ_CHUNK			XFS_RANDOM_DEFAULT
+#define XFS_RANDOM_IODONE_IOERR				(XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATREAD_IOERR			(XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_STRATCMPL_IOERR			(XFS_RANDOM_DEFAULT/10)
+#define XFS_RANDOM_DIOWRITE_IOERR			(XFS_RANDOM_DEFAULT/10)
+
+#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+extern int	xfs_error_test(int, int *, char *, int, char *, unsigned long);
+void xfs_error_test_init(void);
+
+#define XFS_NUM_INJECT_ERROR				10
+
+#ifdef __ANSI_CPP__
+#define XFS_TEST_ERROR(expr, mp, tag, rf)		\
+	((expr) || \
+	 xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
+			 (rf)))
+#else
+#define XFS_TEST_ERROR(expr, mp, tag, rf)		\
+	((expr) || \
+	 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
+			(rf)))
+#endif /* __ANSI_CPP__ */
+
+int		xfs_errortag_add(int error_tag, xfs_mount_t *mp);
+int		xfs_errortag_clear(int error_tag, xfs_mount_t *mp);
+
+int		xfs_errortag_clearall(xfs_mount_t *mp);
+int		xfs_errortag_clearall_umount(int64_t fsid, char *fsname,
+						int loud);
+#else
+#define XFS_TEST_ERROR(expr, mp, tag, rf)	(expr)
+#define xfs_errortag_add(tag, mp)		(ENOSYS)
+#define xfs_errortag_clearall(mp)		(ENOSYS)
+#endif /* (DEBUG || INDUCE_IO_ERROR) */
+
+/*
+ * XFS panic tags -- allow a call to xfs_cmn_err() be turned into
+ *			a panic by setting xfs_panic_mask in the
+ *			stune file.
+ */
+#define		XFS_NO_PTAG			0LL
+#define		XFS_PTAG_IFLUSH			0x0000000000000001LL
+#define		XFS_PTAG_LOGRES			0x0000000000000002LL
+#define		XFS_PTAG_AILDELETE		0x0000000000000004LL
+
+struct xfs_mount;
+/* PRINTFLIKE4 */
+void		xfs_cmn_err(uint64_t panic_tag, int level, struct xfs_mount *mp,
+			    char *fmt, ...);
+/* PRINTFLIKE3 */
+void		xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...);
+
+#endif	/* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
new file mode 100644
index 000000000000..051244882029
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains the implementation of the xfs_efi_log_item
+ * and xfs_efd_log_item items.
+ */
+
+#include <xfs.h>
+
+
+kmem_zone_t	*xfs_efi_zone;
+kmem_zone_t	*xfs_efd_zone;
+
+STATIC void	xfs_efi_item_unlock(xfs_efi_log_item_t *);
+STATIC void	xfs_efi_item_abort(xfs_efi_log_item_t *);
+STATIC void	xfs_efd_item_abort(xfs_efd_log_item_t *);
+
+
+
+/*
+ * This returns the number of iovecs needed to log the given efi item.
+ * We only need 1 iovec for an efi item.  It just logs the efi_log_format
+ * structure.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_efi_item_size(xfs_efi_log_item_t *efip)
+{
+	return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efi log item. We use only 1 iovec, and we point that
+ * at the efi_log_format structure embedded in the efi item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efi item have been filled.
+ */
+STATIC void
+xfs_efi_item_format(xfs_efi_log_item_t	*efip,
+		    xfs_log_iovec_t	*log_vector)
+{
+	uint	size;
+
+	ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents);
+
+	efip->efi_format.efi_type = XFS_LI_EFI;
+
+	size = sizeof(xfs_efi_log_format_t);
+	size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
+	efip->efi_format.efi_size = 1;
+
+	log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
+	log_vector->i_len = size;
+	ASSERT(size >= sizeof(xfs_efi_log_format_t));
+}
+
+
+/*
+ * Pinning has no meaning for an efi item, so just return.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_pin(xfs_efi_log_item_t *efip)
+{
+	return;
+}
+
+
+/*
+ * While EFIs cannot really be pinned, the unpin operation is the
+ * last place at which the EFI is manipulated during a transaction.
+ * Here we coordinate with xfs_efi_cancel() to determine who gets to
+ * free the EFI.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_unpin(xfs_efi_log_item_t *efip)
+{
+	int		nexts;
+	int		size;
+	xfs_mount_t	*mp;
+	SPLDECL(s);
+
+	mp = efip->efi_item.li_mountp;
+	AIL_LOCK(mp, s);
+	if (efip->efi_flags & XFS_EFI_CANCELED) {
+		/*
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
+
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			size = sizeof(xfs_efi_log_item_t);
+			size += (nexts - 1) * sizeof(xfs_extent_t);
+			kmem_free(efip, size);
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+		}
+	} else {
+		efip->efi_flags |= XFS_EFI_COMMITTED;
+		AIL_UNLOCK(mp, s);
+	}
+
+	return;
+}
+
+/*
+ * like unpin only we have to also clear the xaction descriptor
+ * pointing the log item if we free the item.  This routine duplicates
+ * unpin because efi_flags is protected by the AIL lock.  Freeing
+ * the descriptor and then calling unpin would force us to drop the AIL
+ * lock which would open up a race condition.
+ */
+STATIC void
+xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
+{
+	int		nexts;
+	int		size;
+	xfs_mount_t	*mp;
+	xfs_log_item_desc_t	*lidp;
+	SPLDECL(s);
+
+	mp = efip->efi_item.li_mountp;
+	AIL_LOCK(mp, s);
+	if (efip->efi_flags & XFS_EFI_CANCELED) {
+		/*
+		 * free the xaction descriptor pointing to this item
+		 */
+		lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
+		xfs_trans_free_item(tp, lidp);
+		/*
+		 * pull the item off the AIL.
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
+		/*
+		 * now free the item itself
+		 */
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			size = sizeof(xfs_efi_log_item_t);
+			size += (nexts - 1) * sizeof(xfs_extent_t);
+			kmem_free(efip, size);
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+		}
+	} else {
+		efip->efi_flags |= XFS_EFI_COMMITTED;
+		AIL_UNLOCK(mp, s);
+	}
+
+	return;
+}
+
+/*
+ * Efi items have no locking or pushing.  However, since EFIs are
+ * pulled from the AIL when their corresponding EFDs are committed
+ * to disk, their situation is very similar to being pinned.  Return
+ * XFS_ITEM_PINNED so that the caller will eventually flush the log.
+ * This should help in getting the EFI out of the AIL.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_efi_item_trylock(xfs_efi_log_item_t *efip)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * Efi items have no locking, so just return.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
+{
+	if (efip->efi_item.li_flags & XFS_LI_ABORTED)
+		xfs_efi_item_abort(efip);
+	return;
+}
+
+/*
+ * The EFI is logged only once and cannot be moved in the log, so
+ * simply return the lsn at which it's been logged.  The canceled
+ * flag is not paid any attention here.	 Checking for that is delayed
+ * until the EFI is unpinned.
+ */
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+{
+	return lsn;
+}
+
+/*
+ * This is called when the transaction logging the EFI is aborted.
+ * Free up the EFI and return.	No need to clean up the slot for
+ * the item in the transaction.	 That was done by the unpin code
+ * which is called prior to this routine in the abort/fs-shutdown path.
+ */
+STATIC void
+xfs_efi_item_abort(xfs_efi_log_item_t *efip)
+{
+	int	nexts;
+	int	size;
+
+	nexts = efip->efi_format.efi_nextents;
+	if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+		size = sizeof(xfs_efi_log_item_t);
+		size += (nexts - 1) * sizeof(xfs_extent_t);
+		kmem_free(efip, size);
+	} else {
+		kmem_zone_free(xfs_efi_zone, efip);
+	}
+	return;
+}
+
+/*
+ * There isn't much you can do to push on an efi item.	It is simply
+ * stuck waiting for all of its corresponding efd items to be
+ * committed to disk.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_push(xfs_efi_log_item_t *efip)
+{
+	return;
+}
+
+/*
+ * The EFI dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
+{
+	return;
+}
+
+/*
+ * This is the ops vector shared by all efi log items.
+ */
+struct xfs_item_ops xfs_efi_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efi_item_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_efi_item_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efi_item_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efi_item_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
+					xfs_efi_item_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efi_item_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_efi_item_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efi_item_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_efi_item_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_efi_item_committing
+};
+
+
+/*
+ * Allocate and initialize an efi item with the given number of extents.
+ */
+xfs_efi_log_item_t *
+xfs_efi_init(xfs_mount_t	*mp,
+	     uint		nextents)
+
+{
+	xfs_efi_log_item_t	*efip;
+	uint			size;
+
+	ASSERT(nextents > 0);
+	if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
+		size = (uint)(sizeof(xfs_efi_log_item_t) +
+			((nextents - 1) * sizeof(xfs_extent_t)));
+		efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+	} else {
+		efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone,
+							     KM_SLEEP);
+	}
+
+	efip->efi_item.li_type = XFS_LI_EFI;
+	efip->efi_item.li_ops = &xfs_efi_item_ops;
+	efip->efi_item.li_mountp = mp;
+	efip->efi_format.efi_nextents = nextents;
+	efip->efi_format.efi_id = (__psint_t)(void*)efip;
+
+	return (efip);
+}
+
+/*
+ * This is called by the efd item code below to release references to
+ * the given efi item.	Each efd calls this with the number of
+ * extents that it has logged, and when the sum of these reaches
+ * the total number of extents logged by this efi item we can free
+ * the efi item.
+ *
+ * Freeing the efi item requires that we remove it from the AIL.
+ * We'll use the AIL lock to protect our counters as well as
+ * the removal from the AIL.
+ */
+void
+xfs_efi_release(xfs_efi_log_item_t	*efip,
+		uint			nextents)
+{
+	xfs_mount_t	*mp;
+	int		extents_left;
+	uint		size;
+	int		nexts;
+	SPLDECL(s);
+
+	mp = efip->efi_item.li_mountp;
+	ASSERT(efip->efi_next_extent > 0);
+	ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
+
+	AIL_LOCK(mp, s);
+	ASSERT(efip->efi_next_extent >= nextents);
+	efip->efi_next_extent -= nextents;
+	extents_left = efip->efi_next_extent;
+	if (extents_left == 0) {
+		/*
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
+	} else {
+		AIL_UNLOCK(mp, s);
+	}
+
+	if (extents_left == 0) {
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			size = sizeof(xfs_efi_log_item_t);
+			size += (nexts - 1) * sizeof(xfs_extent_t);
+			kmem_free(efip, size);
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+		}
+	}
+}
+
+/*
+ * This is called when the transaction that should be committing the
+ * EFD corresponding to the given EFI is aborted.  The committed and
+ * canceled flags are used to coordinate the freeing of the EFI and
+ * the references by the transaction that committed it.
+ */
+STATIC void
+xfs_efi_cancel(
+	xfs_efi_log_item_t	*efip)
+{
+	int		nexts;
+	int		size;
+	xfs_mount_t	*mp;
+	SPLDECL(s);
+
+	mp = efip->efi_item.li_mountp;
+	AIL_LOCK(mp, s);
+	if (efip->efi_flags & XFS_EFI_COMMITTED) {
+		/*
+		 * xfs_trans_delete_ail() drops the AIL lock.
+		 */
+		xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
+
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			size = sizeof(xfs_efi_log_item_t);
+			size += (nexts - 1) * sizeof(xfs_extent_t);
+			kmem_free(efip, size);
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+		}
+	} else {
+		efip->efi_flags |= XFS_EFI_CANCELED;
+		AIL_UNLOCK(mp, s);
+	}
+
+	return;
+}
+
+
+
+
+
+/*
+ * This returns the number of iovecs needed to log the given efd item.
+ * We only need 1 iovec for an efd item.  It just logs the efd_log_format
+ * structure.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_efd_item_size(xfs_efd_log_item_t *efdp)
+{
+	return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given efd log item. We use only 1 iovec, and we point that
+ * at the efd_log_format structure embedded in the efd item.
+ * It is at this point that we assert that all of the extent
+ * slots in the efd item have been filled.
+ */
+STATIC void
+xfs_efd_item_format(xfs_efd_log_item_t	*efdp,
+		    xfs_log_iovec_t	*log_vector)
+{
+	uint	size;
+
+	ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
+
+	efdp->efd_format.efd_type = XFS_LI_EFD;
+
+	size = sizeof(xfs_efd_log_format_t);
+	size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
+	efdp->efd_format.efd_size = 1;
+
+	log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
+	log_vector->i_len = size;
+	ASSERT(size >= sizeof(xfs_efd_log_format_t));
+}
+
+
+/*
+ * Pinning has no meaning for an efd item, so just return.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_pin(xfs_efd_log_item_t *efdp)
+{
+	return;
+}
+
+
+/*
+ * Since pinning has no meaning for an efd item, unpinning does
+ * not either.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_unpin(xfs_efd_log_item_t *efdp)
+{
+	return;
+}
+
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp)
+{
+	return;
+}
+
+/*
+ * Efd items have no locking, so just return success.
+ */
+/*ARGSUSED*/
+STATIC uint
+xfs_efd_item_trylock(xfs_efd_log_item_t *efdp)
+{
+	return XFS_ITEM_LOCKED;
+}
+
+/*
+ * Efd items have no locking or pushing, so return failure
+ * so that the caller doesn't bother with us.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
+{
+	if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
+		xfs_efd_item_abort(efdp);
+	return;
+}
+
+/*
+ * When the efd item is committed to disk, all we need to do
+ * is delete our reference to our partner efi item and then
+ * free ourselves.  Since we're freeing ourselves we must
+ * return -1 to keep the transaction code from further referencing
+ * this item.
+ */
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
+{
+	uint	size;
+	int	nexts;
+
+	/*
+	 * If we got a log I/O error, it's always the case that the LR with the
+	 * EFI got unpinned and freed before the EFD got aborted.
+	 */
+	if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+		xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
+
+	nexts = efdp->efd_format.efd_nextents;
+	if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+		size = sizeof(xfs_efd_log_item_t);
+		size += (nexts - 1) * sizeof(xfs_extent_t);
+		kmem_free(efdp, size);
+	} else {
+		kmem_zone_free(xfs_efd_zone, efdp);
+	}
+
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * The transaction of which this EFD is a part has been aborted.
+ * Inform its companion EFI of this fact and then clean up after
+ * ourselves.  No need to clean up the slot for the item in the
+ * transaction.	 That was done by the unpin code which is called
+ * prior to this routine in the abort/fs-shutdown path.
+ */
+STATIC void
+xfs_efd_item_abort(xfs_efd_log_item_t *efdp)
+{
+	int	nexts;
+	int	size;
+
+	/*
+	 * If we got a log I/O error, it's always the case that the LR with the
+	 * EFI got unpinned and freed before the EFD got aborted. So don't
+	 * reference the EFI at all in that case.
+	 */
+	if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
+		xfs_efi_cancel(efdp->efd_efip);
+
+	nexts = efdp->efd_format.efd_nextents;
+	if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
+		size = sizeof(xfs_efd_log_item_t);
+		size += (nexts - 1) * sizeof(xfs_extent_t);
+		kmem_free(efdp, size);
+	} else {
+		kmem_zone_free(xfs_efd_zone, efdp);
+	}
+	return;
+}
+
+/*
+ * There isn't much you can do to push on an efd item.	It is simply
+ * stuck waiting for the log to be flushed to disk.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_push(xfs_efd_log_item_t *efdp)
+{
+	return;
+}
+
+/*
+ * The EFD dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+/*ARGSUSED*/
+STATIC void
+xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn)
+{
+	return;
+}
+
+/*
+ * This is the ops vector shared by all efd log items.
+ */
+struct xfs_item_ops xfs_efd_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_efd_item_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_efd_item_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_efd_item_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_efd_item_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
+					xfs_efd_item_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_efd_item_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_efd_item_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efd_item_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_efd_item_abort,
+	.iop_pushbuf	= NULL,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_efd_item_committing
+};
+
+
+/*
+ * Allocate and initialize an efd item with the given number of extents.
+ */
+xfs_efd_log_item_t *
+xfs_efd_init(xfs_mount_t	*mp,
+	     xfs_efi_log_item_t *efip,
+	     uint		nextents)
+
+{
+	xfs_efd_log_item_t	*efdp;
+	uint			size;
+
+	ASSERT(nextents > 0);
+	if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
+		size = (uint)(sizeof(xfs_efd_log_item_t) +
+			((nextents - 1) * sizeof(xfs_extent_t)));
+		efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP);
+	} else {
+		efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone,
+							     KM_SLEEP);
+	}
+
+	efdp->efd_item.li_type = XFS_LI_EFD;
+	efdp->efd_item.li_ops = &xfs_efd_item_ops;
+	efdp->efd_item.li_mountp = mp;
+	efdp->efd_efip = efip;
+	efdp->efd_format.efd_nextents = nextents;
+	efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+
+	return (efdp);
+}
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
new file mode 100644
index 000000000000..d3bf293c2b96
--- /dev/null
+++ b/fs/xfs/xfs_extfree_item.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_EXTFREE_ITEM_H__
+#define __XFS_EXTFREE_ITEM_H__
+
+struct xfs_mount;
+struct kmem_zone;
+
+typedef struct xfs_extent {
+	xfs_dfsbno_t	ext_start;
+	xfs_extlen_t	ext_len;
+} xfs_extent_t;
+
+/*
+ * This is the structure used to lay out an efi log item in the
+ * log.	 The efi_extents field is a variable size array whose
+ * size is given by efi_nextents.
+ */
+typedef struct xfs_efi_log_format {
+	unsigned short		efi_type;	/* efi log item type */
+	unsigned short		efi_size;	/* size of this item */
+	uint			efi_nextents;	/* # extents to free */
+	__uint64_t		efi_id;		/* efi identifier */
+	xfs_extent_t		efi_extents[1]; /* array of extents to free */
+} xfs_efi_log_format_t;
+
+/*
+ * This is the structure used to lay out an efd log item in the
+ * log.	 The efd_extents array is a variable size array whose
+ * size is given by efd_nextents;
+ */
+typedef struct xfs_efd_log_format {
+	unsigned short		efd_type;	/* efd log item type */
+	unsigned short		efd_size;	/* size of this item */
+	uint			efd_nextents;	/* # of extents freed */
+	__uint64_t		efd_efi_id;	/* id of corresponding efi */
+	xfs_extent_t		efd_extents[1]; /* array of extents freed */
+} xfs_efd_log_format_t;
+
+
+#ifdef __KERNEL__
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_EFI_MAX_FAST_EXTENTS	16
+
+/*
+ * Define EFI flags.
+ */
+#define XFS_EFI_RECOVERED	0x1
+#define XFS_EFI_COMMITTED	0x2
+#define XFS_EFI_CANCELED	0x4
+
+/*
+ * This is the "extent free intention" log item.  It is used
+ * to log the fact that some extents need to be free.  It is
+ * used in conjunction with the "extent free done" log item
+ * described below.
+ */
+typedef struct xfs_efi_log_item {
+	xfs_log_item_t		efi_item;
+	uint			efi_flags;	/* misc flags */
+	uint			efi_next_extent;
+	xfs_efi_log_format_t	efi_format;
+} xfs_efi_log_item_t;
+
+/*
+ * This is the "extent free done" log item.  It is used to log
+ * the fact that some extents earlier mentioned in an efi item
+ * have been freed.
+ */
+typedef struct xfs_efd_log_item {
+	xfs_log_item_t		efd_item;
+	xfs_efi_log_item_t	*efd_efip;
+	uint			efd_next_extent;
+	xfs_efd_log_format_t	efd_format;
+} xfs_efd_log_item_t;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define XFS_EFD_MAX_FAST_EXTENTS	16
+
+extern struct kmem_zone	*xfs_efi_zone;
+extern struct kmem_zone	*xfs_efd_zone;
+
+xfs_efi_log_item_t	*xfs_efi_init(struct xfs_mount *, uint);
+xfs_efd_log_item_t	*xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
+				      uint);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
new file mode 100644
index 000000000000..7984d92618fd
--- /dev/null
+++ b/fs/xfs/xfs_fs.h
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 1995-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307,
+ * USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef _LINUX_XFS_FS_H
+#define _LINUX_XFS_FS_H
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+
+
+/*
+ * SGI's XFS filesystem's major stuff (constants, structures)
+ */
+
+#define XFS_NAME	"xfs"
+
+/*
+ * Direct I/O attribute record used with XFS_IOC_DIOINFO
+ * d_miniosz is the min xfer size, xfer size multiple and file seek offset
+ * alignment.
+ */
+struct dioattr {
+	__u32		d_mem;		/* data buffer memory alignment */
+	__u32		d_miniosz;	/* min xfer size		*/
+	__u32		d_maxiosz;	/* max xfer size		*/
+};
+
+/*
+ * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
+ */
+struct fsxattr {
+	__u32		fsx_xflags;	/* xflags field value (get/set) */
+	__u32		fsx_extsize;	/* extsize field value (get/set)*/
+	__u32		fsx_nextents;	/* nextents field value (get)	*/
+	unsigned char	fsx_pad[16];
+};
+
+/*
+ * Flags for the bs_xflags/fsx_xflags field
+ * There should be a one-to-one correspondence between these flags and the
+ * XFS_DIFLAG_s.
+ */
+#define XFS_XFLAG_REALTIME	0x00000001
+#define XFS_XFLAG_PREALLOC	0x00000002
+#define XFS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
+#define XFS_XFLAG_ALL		\
+	( XFS_XFLAG_REALTIME|XFS_XFLAG_PREALLOC|XFS_XFLAG_HASATTR )
+
+
+/*
+ * Structure for XFS_IOC_GETBMAP.
+ * On input, fill in bmv_offset and bmv_length of the first structure
+ * to indicate the area of interest in the file, and bmv_entry with the
+ * number of array elements given.  The first structure is updated on
+ * return to give the offset and length for the next call.
+ */
+struct getbmap {
+	__s64		bmv_offset;	/* file offset of segment in blocks */
+	__s64		bmv_block;	/* starting block (64-bit daddr_t)  */
+	__s64		bmv_length;	/* length of segment, blocks	    */
+	__s32		bmv_count;	/* # of entries in array incl. 1st  */
+	__s32		bmv_entries;	/* # of entries filled in (output)  */
+};
+
+/*
+ *	Structure for XFS_IOC_GETBMAPX.	 Fields bmv_offset through bmv_entries
+ *	are used exactly as in the getbmap structure.  The getbmapx structure
+ *	has additional bmv_iflags and bmv_oflags fields. The bmv_iflags field
+ *	is only used for the first structure.  It contains input flags
+ *	specifying XFS_IOC_GETBMAPX actions.  The bmv_oflags field is filled
+ *	in by the XFS_IOC_GETBMAPX command for each returned structure after
+ *	the first.
+ */
+struct getbmapx {
+	__s64		bmv_offset;	/* file offset of segment in blocks */
+	__s64		bmv_block;	/* starting block (64-bit daddr_t)  */
+	__s64		bmv_length;	/* length of segment, blocks	    */
+	__s32		bmv_count;	/* # of entries in array incl. 1st  */
+	__s32		bmv_entries;	/* # of entries filled in (output). */
+	__s32		bmv_iflags;	/* input flags (1st structure)	    */
+	__s32		bmv_oflags;	/* output flags (after 1st structure)*/
+	__s32		bmv_unused1;	/* future use			    */
+	__s32		bmv_unused2;	/* future use			    */
+};
+
+/*	bmv_iflags values - set by XFS_IOC_GETBMAPX caller.	*/
+#define BMV_IF_ATTRFORK		0x1	/* return attr fork rather than data */
+#define BMV_IF_NO_DMAPI_READ	0x2	/* Do not generate DMAPI read event  */
+#define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
+#define BMV_IF_VALID	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC)
+#ifdef __KERNEL__
+#define BMV_IF_EXTENDED 0x40000000	/* getpmapx if set */
+#endif
+
+/*	bmv_oflags values - returned for for each non-header segment */
+#define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
+
+/*	Convert getbmap <-> getbmapx - move fields from p1 to p2. */
+#define GETBMAP_CONVERT(p1,p2) {	\
+	p2.bmv_offset = p1.bmv_offset;	\
+	p2.bmv_block = p1.bmv_block;	\
+	p2.bmv_length = p1.bmv_length;	\
+	p2.bmv_count = p1.bmv_count;	\
+	p2.bmv_entries = p1.bmv_entries;  }
+
+
+/*
+ * Structure for XFS_IOC_FSSETDM.
+ * For use by backup and restore programs to set the XFS on-disk inode
+ * fields di_dmevmask and di_dmstate.  These must be set to exactly and
+ * only values previously obtained via xfs_bulkstat!  (Specifically the
+ * xfs_bstat_t fields bs_dmevmask and bs_dmstate.)
+ */
+struct fsdmidata {
+	__u32		fsd_dmevmask;	/* corresponds to di_dmevmask */
+	__u16		fsd_padding;
+	__u16		fsd_dmstate;	/* corresponds to di_dmstate  */
+};
+
+/*
+ * File segment locking set data type for 64 bit access.
+ * Also used for all the RESV/FREE interfaces.
+ */
+typedef struct xfs_flock64 {
+	__s16		l_type;
+	__s16		l_whence;
+	__s64		l_start;
+	__s64		l_len;		/* len == 0 means until end of file */
+	__s32		l_sysid;
+	pid_t		l_pid;
+	__s32		l_pad[4];	/* reserve area			    */
+} xfs_flock64_t;
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V1
+ */
+typedef struct xfs_fsop_geom_v1 {
+	__u32		blocksize;	/* filesystem (data) block size */
+	__u32		rtextsize;	/* realtime extent size		*/
+	__u32		agblocks;	/* fsblocks in an AG		*/
+	__u32		agcount;	/* number of allocation groups	*/
+	__u32		logblocks;	/* fsblocks in the log		*/
+	__u32		sectsize;	/* (data) sector size, bytes	*/
+	__u32		inodesize;	/* inode size in bytes		*/
+	__u32		imaxpct;	/* max allowed inode space(%)	*/
+	__u64		datablocks;	/* fsblocks in data subvolume	*/
+	__u64		rtblocks;	/* fsblocks in realtime subvol	*/
+	__u64		rtextents;	/* rt extents in realtime subvol*/
+	__u64		logstart;	/* starting fsblock of the log	*/
+	unsigned char	uuid[16];	/* unique id of the filesystem	*/
+	__u32		sunit;		/* stripe unit, fsblocks	*/
+	__u32		swidth;		/* stripe width, fsblocks	*/
+	__s32		version;	/* structure version		*/
+	__u32		flags;		/* superblock version flags	*/
+	__u32		logsectsize;	/* log sector size, bytes	*/
+	__u32		rtsectsize;	/* realtime sector size, bytes	*/
+	__u32		dirblocksize;	/* directory block size, bytes	*/
+} xfs_fsop_geom_v1_t;
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY
+ */
+typedef struct xfs_fsop_geom {
+	__u32		blocksize;	/* filesystem (data) block size */
+	__u32		rtextsize;	/* realtime extent size		*/
+	__u32		agblocks;	/* fsblocks in an AG		*/
+	__u32		agcount;	/* number of allocation groups	*/
+	__u32		logblocks;	/* fsblocks in the log		*/
+	__u32		sectsize;	/* (data) sector size, bytes	*/
+	__u32		inodesize;	/* inode size in bytes		*/
+	__u32		imaxpct;	/* max allowed inode space(%)	*/
+	__u64		datablocks;	/* fsblocks in data subvolume	*/
+	__u64		rtblocks;	/* fsblocks in realtime subvol	*/
+	__u64		rtextents;	/* rt extents in realtime subvol*/
+	__u64		logstart;	/* starting fsblock of the log	*/
+	unsigned char	uuid[16];	/* unique id of the filesystem	*/
+	__u32		sunit;		/* stripe unit, fsblocks	*/
+	__u32		swidth;		/* stripe width, fsblocks	*/
+	__s32		version;	/* structure version		*/
+	__u32		flags;		/* superblock version flags	*/
+	__u32		logsectsize;	/* log sector size, bytes	*/
+	__u32		rtsectsize;	/* realtime sector size, bytes	*/
+	__u32		dirblocksize;	/* directory block size, bytes	*/
+	__u32		logsunit;	/* log stripe unit, bytes */
+} xfs_fsop_geom_t;
+
+/* Output for XFS_FS_COUNTS */
+typedef struct xfs_fsop_counts {
+	__u64	freedata;	/* free data section blocks */
+	__u64	freertx;	/* free rt extents */
+	__u64	freeino;	/* free inodes */
+	__u64	allocino;	/* total allocated inodes */
+} xfs_fsop_counts_t;
+
+/* Input/Output for XFS_GET_RESBLKS and XFS_SET_RESBLKS */
+typedef struct xfs_fsop_resblks {
+	__u64  resblks;
+	__u64  resblks_avail;
+} xfs_fsop_resblks_t;
+
+#define XFS_FSOP_GEOM_VERSION	0
+
+#define XFS_FSOP_GEOM_FLAGS_ATTR	0x01	/* attributes in use	*/
+#define XFS_FSOP_GEOM_FLAGS_NLINK	0x02	/* 32-bit nlink values	*/
+#define XFS_FSOP_GEOM_FLAGS_QUOTA	0x04	/* quotas enabled	*/
+#define XFS_FSOP_GEOM_FLAGS_IALIGN	0x08	/* inode alignment	*/
+#define XFS_FSOP_GEOM_FLAGS_DALIGN	0x10	/* large data alignment */
+#define XFS_FSOP_GEOM_FLAGS_SHARED	0x20	/* read-only shared	*/
+#define XFS_FSOP_GEOM_FLAGS_EXTFLG	0x40	/* special extent flag	*/
+#define XFS_FSOP_GEOM_FLAGS_DIRV2	0x80	/* directory version 2	*/
+#define XFS_FSOP_GEOM_FLAGS_LOGV2      0x100	/* log format version 2 */
+
+
+/*
+ * Minimum and maximum sizes need for growth checks
+ */
+#define XFS_MIN_AG_BLOCKS	64
+#define XFS_MIN_LOG_BLOCKS	512
+#define XFS_MAX_LOG_BLOCKS	(64 * 1024)
+#define XFS_MIN_LOG_BYTES	(256 * 1024)
+#define XFS_MAX_LOG_BYTES	(128 * 1024 * 1024)
+
+/*
+ * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
+ */
+typedef struct xfs_growfs_data {
+	__u64		newblocks;	/* new data subvol size, fsblocks */
+	__u32		imaxpct;	/* new inode space percentage limit */
+} xfs_growfs_data_t;
+
+typedef struct xfs_growfs_log {
+	__u32		newblocks;	/* new log size, fsblocks */
+	__u32		isint;		/* 1 if new log is internal */
+} xfs_growfs_log_t;
+
+typedef struct xfs_growfs_rt {
+	__u64		newblocks;	/* new realtime size, fsblocks */
+	__u32		extsize;	/* new realtime extent size, fsblocks */
+} xfs_growfs_rt_t;
+
+
+/*
+ * Structures returned from ioctl XFS_IOC_FSBULKSTAT & XFS_IOC_FSBULKSTAT_SINGLE
+ */
+typedef struct xfs_bstime {
+	time_t		tv_sec;		/* seconds		*/
+	__s32		tv_nsec;	/* and nanoseconds	*/
+} xfs_bstime_t;
+
+typedef struct xfs_bstat {
+	__u64		bs_ino;		/* inode number			*/
+	__u16		bs_mode;	/* type and mode		*/
+	__u16		bs_nlink;	/* number of links		*/
+	__u32		bs_uid;		/* user id			*/
+	__u32		bs_gid;		/* group id			*/
+	__u32		bs_rdev;	/* device value			*/
+	__s32		bs_blksize;	/* block size			*/
+	__s64		bs_size;	/* file size			*/
+	xfs_bstime_t	bs_atime;	/* access time			*/
+	xfs_bstime_t	bs_mtime;	/* modify time			*/
+	xfs_bstime_t	bs_ctime;	/* inode change time		*/
+	int64_t		bs_blocks;	/* number of blocks		*/
+	__u32		bs_xflags;	/* extended flags		*/
+	__s32		bs_extsize;	/* extent size			*/
+	__s32		bs_extents;	/* number of extents		*/
+	__u32		bs_gen;		/* generation count		*/
+	__u16		bs_projid;	/* project id			*/
+	unsigned char	bs_pad[14];	/* pad space, unused		*/
+	__u32		bs_dmevmask;	/* DMIG event mask		*/
+	__u16		bs_dmstate;	/* DMIG state info		*/
+	__u16		bs_aextents;	/* attribute number of extents	*/
+} xfs_bstat_t;
+
+/*
+ * The user-level BulkStat Request interface structure.
+ */
+typedef struct xfs_fsop_bulkreq {
+	__u64		*lastip;	/* last inode # pointer		*/
+	__s32		icount;		/* count of entries in buffer	*/
+	void		*ubuffer;	/* user buffer for inode desc.	*/
+	__s32		*ocount;	/* output count pointer		*/
+} xfs_fsop_bulkreq_t;
+
+
+/*
+ * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
+ */
+typedef struct xfs_inogrp {
+	__u64		xi_startino;	/* starting inode number	*/
+	__s32		xi_alloccount;	/* # bits set in allocmask	*/
+	__u64		xi_allocmask;	/* mask of allocated inodes	*/
+} xfs_inogrp_t;
+
+
+/*
+ * Error injection.
+ */
+typedef struct xfs_error_injection {
+	__s32		fd;
+	__s32		errtag;
+} xfs_error_injection_t;
+
+
+/*
+ * The user-level Handle Request interface structure.
+ */
+typedef struct xfs_fsop_handlereq {
+	__u32		fd;		/* fd for FD_TO_HANDLE		*/
+	void		*path;		/* user pathname		*/
+	__u32		oflags;		/* open flags			*/
+	void		*ihandle;	/* user supplied handle		*/
+	__u32		ihandlen;	/* user supplied length		*/
+	void		*ohandle;	/* user buffer for handle	*/
+	__u32		*ohandlen;	/* user buffer length		*/
+} xfs_fsop_handlereq_t;
+
+/*
+ * Compound structures for passing args through Handle Request interfaces
+ * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle
+ * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and
+ *	     XFS_IOC_ATTRMULTI_BY_HANDLE
+ */
+
+typedef struct xfs_fsop_setdm_handlereq {
+	struct xfs_fsop_handlereq hreq; /* handle interface structure */
+	struct fsdmidata *data;		/* DMAPI data to set	      */
+} xfs_fsop_setdm_handlereq_t;
+
+typedef struct xfs_attrlist_cursor {
+	__u32	opaque[4];
+} xfs_attrlist_cursor_t;
+
+typedef struct xfs_fsop_attrlist_handlereq {
+	struct xfs_fsop_handlereq hreq; /* handle interface structure */
+	struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
+	__u32 flags;			/* flags, use ROOT/USER names */
+	__u32 buflen;			/* length of buffer supplied  */
+	void *buffer;			/* attrlist data to return    */
+} xfs_fsop_attrlist_handlereq_t;
+
+typedef struct xfs_attr_multiop {
+	__u32	am_opcode;
+	__s32	am_error;
+	void	*am_attrname;
+	void	*am_attrvalue;
+	__u32	am_length;
+	__u32	am_flags;
+} xfs_attr_multiop_t;
+
+typedef struct xfs_fsop_attrmulti_handlereq {
+	struct xfs_fsop_handlereq hreq; /* handle interface structure */
+	__u32 opcount;			/* count of following multiop */
+	struct xfs_attr_multiop *ops;	/* attr_multi data to get/set */
+} xfs_fsop_attrmulti_handlereq_t;
+
+/*
+ * File system identifier. Should be unique (at least per machine).
+ */
+typedef struct {
+	__u32 val[2];			/* file system id type */
+} xfs_fsid_t;
+
+/*
+ * File identifier.  Should be unique per filesystem on a single machine.
+ * This is typically called by a stateless file server in order to generate
+ * "file handles".
+ */
+#define MAXFIDSZ	46
+typedef struct fid {
+	__u16		fid_len;		/* length of data in bytes */
+	unsigned char	fid_data[MAXFIDSZ];	/* data (variable length)  */
+} fid_t;
+
+typedef struct xfs_fid {
+	__u16	xfs_fid_len;		/* length of remainder	*/
+	__u16	xfs_fid_pad;
+	__u32	xfs_fid_gen;		/* generation number	*/
+	__u64	xfs_fid_ino;		/* 64 bits inode number */
+} xfs_fid_t;
+
+typedef struct xfs_fid2 {
+	__u16	fid_len;	/* length of remainder */
+	__u16	fid_pad;	/* padding, must be zero */
+	__u32	fid_gen;	/* generation number */
+	__u64	fid_ino;	/* inode number */
+} xfs_fid2_t;
+
+typedef struct xfs_handle {
+	union {
+		__s64	    align;	/* force alignment of ha_fid	 */
+		xfs_fsid_t  _ha_fsid;	/* unique file system identifier */
+	} ha_u;
+	xfs_fid_t	ha_fid;		/* file system specific file ID	 */
+} xfs_handle_t;
+#define ha_fsid ha_u._ha_fsid
+
+#define XFS_HSIZE(handle)	(((char *) &(handle).ha_fid.xfs_fid_pad	 \
+				 - (char *) &(handle))			  \
+				 + (handle).ha_fid.xfs_fid_len)
+
+#define XFS_HANDLE_CMP(h1, h2)	bcmp(h1, h2, sizeof (xfs_handle_t))
+
+#define FSHSIZE		sizeof (fsid_t)
+
+
+/*
+ * ioctl commands that replace IRIX fcntl()'s
+ * For 'documentation' purposed more than anything else,
+ * the "cmd #" field reflects the IRIX fcntl number.
+ */
+#define XFS_IOC_ALLOCSP		_IOW ('X', 10, struct xfs_flock64)
+#define XFS_IOC_FREESP		_IOW ('X', 11, struct xfs_flock64)
+#define XFS_IOC_DIOINFO		_IOR ('X', 30, struct dioattr)
+#define XFS_IOC_FSGETXATTR	_IOR ('X', 31, struct fsxattr)
+#define XFS_IOC_FSSETXATTR	_IOW ('X', 32, struct fsxattr)
+#define XFS_IOC_ALLOCSP64	_IOW ('X', 36, struct xfs_flock64)
+#define XFS_IOC_FREESP64	_IOW ('X', 37, struct xfs_flock64)
+#define XFS_IOC_GETBMAP		_IOWR('X', 38, struct getbmap)
+#define XFS_IOC_FSSETDM		_IOW ('X', 39, struct fsdmidata)
+#define XFS_IOC_RESVSP		_IOW ('X', 40, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP	_IOW ('X', 41, struct xfs_flock64)
+#define XFS_IOC_RESVSP64	_IOW ('X', 42, struct xfs_flock64)
+#define XFS_IOC_UNRESVSP64	_IOW ('X', 43, struct xfs_flock64)
+#define XFS_IOC_GETBMAPA	_IOWR('X', 44, struct getbmap)
+#define XFS_IOC_FSGETXATTRA	_IOR ('X', 45, struct fsxattr)
+/*	XFS_IOC_SETBIOSIZE ---- deprecated 46	   */
+/*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
+#define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
+
+/*
+ * ioctl commands that replace IRIX syssgi()'s
+ */
+#define XFS_IOC_FSGEOMETRY_V1	     _IOR ('X', 100, struct xfs_fsop_geom_v1)
+#define XFS_IOC_FSBULKSTAT	     _IOWR('X', 101, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE    _IOWR('X', 102, struct xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS	     _IOWR('X', 103, struct xfs_fsop_bulkreq)
+#define XFS_IOC_PATH_TO_FSHANDLE     _IOWR('X', 104, struct xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE	     _IOWR('X', 105, struct xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE	     _IOWR('X', 106, struct xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE	     _IOWR('X', 107, struct xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE   _IOWR('X', 108, struct xfs_fsop_handlereq)
+#define XFS_IOC_SWAPEXT		     _IOWR('X', 109, struct xfs_swapext)
+#define XFS_IOC_FSGROWFSDATA	     _IOW ('X', 110, struct xfs_growfs_data)
+#define XFS_IOC_FSGROWFSLOG	     _IOW ('X', 111, struct xfs_growfs_log)
+#define XFS_IOC_FSGROWFSRT	     _IOW ('X', 112, struct xfs_growfs_rt)
+#define XFS_IOC_FSCOUNTS	     _IOR ('X', 113, struct xfs_fsop_counts)
+#define XFS_IOC_SET_RESBLKS	     _IOR ('X', 114, struct xfs_fsop_resblks)
+#define XFS_IOC_GET_RESBLKS	     _IOR ('X', 115, struct xfs_fsop_resblks)
+#define XFS_IOC_ERROR_INJECTION	     _IOW ('X', 116, struct xfs_error_injection)
+#define XFS_IOC_ERROR_CLEARALL	     _IOW ('X', 117, struct xfs_error_injection)
+/*	XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118	 */
+#define XFS_IOC_FREEZE		     _IOWR('X', 119, int)
+#define XFS_IOC_THAW		     _IOWR('X', 120, int)
+#define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
+#define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
+#define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
+#define XFS_IOC_FSGEOMETRY	     _IOR ('X', 124, struct xfs_fsop_geom)
+/*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
+
+
+/*
+ * Block I/O parameterization.	A basic block (BB) is the lowest size of
+ * filesystem allocation, and must equal 512.  Length units given to bio
+ * routines are in BB's.
+ */
+#define BBSHIFT		9
+#define BBSIZE		(1<<BBSHIFT)
+#define BBMASK		(BBSIZE-1)
+#define BTOBB(bytes)	(((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
+#define BTOBBT(bytes)	((__u64)(bytes) >> BBSHIFT)
+#define BBTOB(bbs)	((bbs) << BBSHIFT)
+#define OFFTOBB(bytes)	(((__u64)(bytes) + BBSIZE - 1) >> BBSHIFT)
+#define OFFTOBBT(bytes) ((__u64)(bytes) >> BBSHIFT)
+#define BBTOOFF(bbs)	((__u64)(bbs) << BBSHIFT)
+
+#define SEEKLIMIT32	0x7fffffff
+#define BBSEEKLIMIT32	BTOBBT(SEEKLIMIT32)
+#define SEEKLIMIT	0x7fffffffffffffffLL
+#define BBSEEKLIMIT	OFFTOBBT(SEEKLIMIT)
+
+#endif	/* _LINUX_XFS_FS_H */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
new file mode 100644
index 000000000000..4cd53ed2d791
--- /dev/null
+++ b/fs/xfs/xfs_fsops.c
@@ -0,0 +1,598 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * File system operations
+ */
+
+int
+xfs_fs_geometry(
+	xfs_mount_t		*mp,
+	xfs_fsop_geom_t		*geo,
+	int			new_version)
+{
+	geo->blocksize = mp->m_sb.sb_blocksize;
+	geo->rtextsize = mp->m_sb.sb_rextsize;
+	geo->agblocks = mp->m_sb.sb_agblocks;
+	geo->agcount = mp->m_sb.sb_agcount;
+	geo->logblocks = mp->m_sb.sb_logblocks;
+	geo->sectsize = mp->m_sb.sb_sectsize;
+	geo->inodesize = mp->m_sb.sb_inodesize;
+	geo->imaxpct = mp->m_sb.sb_imax_pct;
+	geo->datablocks = mp->m_sb.sb_dblocks;
+	geo->rtblocks = mp->m_sb.sb_rblocks;
+	geo->rtextents = mp->m_sb.sb_rextents;
+	geo->logstart = mp->m_sb.sb_logstart;
+	ASSERT(sizeof(geo->uuid)==sizeof(mp->m_sb.sb_uuid));
+	memcpy(geo->uuid, &mp->m_sb.sb_uuid, sizeof(mp->m_sb.sb_uuid));
+	if (new_version >= 2) {
+		geo->sunit = mp->m_sb.sb_unit;
+		geo->swidth = mp->m_sb.sb_width;
+	}
+	if (new_version >= 3) {
+		geo->version = XFS_FSOP_GEOM_VERSION;
+		geo->flags =
+			(XFS_SB_VERSION_HASATTR(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
+			(XFS_SB_VERSION_HASNLINK(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
+			(XFS_SB_VERSION_HASQUOTA(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
+			(XFS_SB_VERSION_HASALIGN(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
+			(XFS_SB_VERSION_HASDALIGN(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
+			(XFS_SB_VERSION_HASSHARED(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
+			(XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
+			(XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_DIRV2 : 0);
+		geo->logsectsize = mp->m_sb.sb_sectsize;	/* XXX */
+		geo->rtsectsize = mp->m_sb.sb_sectsize;		/* XXX */
+		geo->dirblocksize = mp->m_dirblksize;
+	}
+	if (new_version >= 4) {
+		geo->flags |=
+			(XFS_SB_VERSION_HASLOGV2(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_LOGV2 : 0);
+		geo->logsunit = mp->m_sb.sb_logsunit;
+	}
+	return 0;
+}
+
+static int
+xfs_growfs_data_private(
+	xfs_mount_t		*mp,		/* mount point for filesystem */
+	xfs_growfs_data_t	*in)		/* growfs data input struct */
+{
+	xfs_agf_t		*agf;
+	xfs_agi_t		*agi;
+	xfs_agnumber_t		agno;
+	xfs_extlen_t		agsize;
+	xfs_extlen_t		tmpsize;
+	xfs_alloc_rec_t		*arec;
+	xfs_btree_sblock_t	*block;
+	xfs_buf_t		*bp;
+	int			bsize;
+	int			bucket;
+	xfs_daddr_t		disk_addr;
+	int			dpct;
+	int			error;
+	xfs_agnumber_t		nagcount;
+	xfs_rfsblock_t		nb, nb_mod;
+	xfs_rfsblock_t		new;
+	xfs_rfsblock_t		nfree;
+	xfs_agnumber_t		oagcount;
+	int			pct;
+	xfs_sb_t		*sbp;
+	int			sectbb;
+	xfs_trans_t		*tp;
+
+	nb = in->newblocks;
+	pct = in->imaxpct;
+	if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+		return XFS_ERROR(EINVAL);
+	dpct = pct - mp->m_sb.sb_imax_pct;
+	error = xfs_read_buf(mp, mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - 1, 1,
+		0, &bp);
+	if (error)
+		return error;
+	ASSERT(bp);
+	xfs_buf_relse(bp);
+
+	new = nb;	/* use new as a temporary here */
+	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
+	nagcount = new + (nb_mod != 0);
+	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
+		nagcount--;
+		nb = nagcount * mp->m_sb.sb_agblocks;
+		if (nb < mp->m_sb.sb_dblocks)
+			return XFS_ERROR(EINVAL);
+	}
+	new = in->newblocks - mp->m_sb.sb_dblocks;
+	oagcount = mp->m_sb.sb_agcount;
+	if (nagcount > oagcount) {
+		down_write(&mp->m_peraglock);
+		mp->m_perag = kmem_realloc(mp->m_perag,
+			sizeof(xfs_perag_t) * nagcount,
+			sizeof(xfs_perag_t) * oagcount,
+			KM_SLEEP);
+		bzero(&mp->m_perag[oagcount],
+			(nagcount - oagcount) * sizeof(xfs_perag_t));
+		mp->m_flags |= XFS_MOUNT_32BITINODES;
+		xfs_initialize_perag(mp, nagcount);
+		up_write(&mp->m_peraglock);
+	}
+	tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
+	if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp),
+			XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	/* new ag's */
+	sectbb = BTOBB(mp->m_sb.sb_sectsize);
+	bsize = mp->m_sb.sb_blocksize;
+
+	nfree = 0;
+	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+		/*
+		 * AG freelist header block
+		 */
+		disk_addr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR);
+		bp = xfs_buf_get(mp->m_ddev_targp,
+				  disk_addr,
+				  sectbb, 0);
+		agf = XFS_BUF_TO_AGF(bp);
+		bzero(agf, mp->m_sb.sb_sectsize);
+		INT_SET(agf->agf_magicnum, ARCH_CONVERT, XFS_AGF_MAGIC);
+		INT_SET(agf->agf_versionnum, ARCH_CONVERT, XFS_AGF_VERSION);
+		INT_SET(agf->agf_seqno, ARCH_CONVERT, agno);
+		if (agno == nagcount - 1)
+			agsize =
+				nb -
+				(agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+		else
+			agsize = mp->m_sb.sb_agblocks;
+		INT_SET(agf->agf_length, ARCH_CONVERT, agsize);
+		INT_SET(agf->agf_roots[XFS_BTNUM_BNOi], ARCH_CONVERT, XFS_BNO_BLOCK(mp));
+		INT_SET(agf->agf_roots[XFS_BTNUM_CNTi], ARCH_CONVERT, XFS_CNT_BLOCK(mp));
+		INT_SET(agf->agf_levels[XFS_BTNUM_BNOi], ARCH_CONVERT, 1);
+		INT_SET(agf->agf_levels[XFS_BTNUM_CNTi], ARCH_CONVERT, 1);
+		INT_ZERO(agf->agf_flfirst, ARCH_CONVERT);
+		INT_SET(agf->agf_fllast, ARCH_CONVERT, XFS_AGFL_SIZE - 1);
+		INT_ZERO(agf->agf_flcount, ARCH_CONVERT);
+		tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp);
+		INT_SET(agf->agf_freeblks, ARCH_CONVERT, tmpsize);
+		INT_SET(agf->agf_longest, ARCH_CONVERT, tmpsize);
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+		/*
+		 * AG inode header block
+		 */
+		disk_addr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+		bp = xfs_buf_get(mp->m_ddev_targp,
+				  disk_addr,
+				  sectbb, 0);
+		agi = XFS_BUF_TO_AGI(bp);
+		bzero(agi, mp->m_sb.sb_sectsize);
+		INT_SET(agi->agi_magicnum, ARCH_CONVERT, XFS_AGI_MAGIC);
+		INT_SET(agi->agi_versionnum, ARCH_CONVERT, XFS_AGI_VERSION);
+		INT_SET(agi->agi_seqno, ARCH_CONVERT, agno);
+		INT_SET(agi->agi_length, ARCH_CONVERT, agsize);
+		INT_ZERO(agi->agi_count, ARCH_CONVERT);
+		INT_SET(agi->agi_root, ARCH_CONVERT, XFS_IBT_BLOCK(mp));
+		INT_SET(agi->agi_level, ARCH_CONVERT, 1);
+		INT_ZERO(agi->agi_freecount, ARCH_CONVERT);
+		INT_SET(agi->agi_newino, ARCH_CONVERT, NULLAGINO);
+		INT_SET(agi->agi_dirino, ARCH_CONVERT, NULLAGINO);
+		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+			INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT, NULLAGINO);
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+		/*
+		 * BNO btree root block
+		 */
+		disk_addr = XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp));
+		bp = xfs_buf_get(mp->m_ddev_targp,
+			disk_addr,
+			BTOBB(bsize), 0);
+		block = XFS_BUF_TO_SBLOCK(bp);
+		bzero(block, bsize);
+		INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTB_MAGIC);
+		INT_ZERO(block->bb_level, ARCH_CONVERT);
+		INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
+		INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+		INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+		arec = XFS_BTREE_REC_ADDR(bsize, xfs_alloc, block, 1,
+			mp->m_alloc_mxr[0]);
+		INT_SET(arec->ar_startblock, ARCH_CONVERT, XFS_PREALLOC_BLOCKS(mp));
+		INT_SET(arec->ar_blockcount, ARCH_CONVERT, agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT));
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+		/*
+		 * CNT btree root block
+		 */
+		disk_addr = XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp));
+		bp = xfs_buf_get(mp->m_ddev_targp,
+			disk_addr,
+			BTOBB(bsize), 0);
+		block = XFS_BUF_TO_SBLOCK(bp);
+		bzero(block, bsize);
+		INT_SET(block->bb_magic, ARCH_CONVERT, XFS_ABTC_MAGIC);
+		INT_ZERO(block->bb_level, ARCH_CONVERT);
+		INT_SET(block->bb_numrecs, ARCH_CONVERT, 1);
+		INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+		INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+		arec = XFS_BTREE_REC_ADDR(bsize, xfs_alloc, block, 1,
+			mp->m_alloc_mxr[0]);
+		INT_SET(arec->ar_startblock, ARCH_CONVERT, XFS_PREALLOC_BLOCKS(mp));
+		INT_SET(arec->ar_blockcount, ARCH_CONVERT, agsize - INT_GET(arec->ar_startblock, ARCH_CONVERT));
+		nfree += INT_GET(arec->ar_blockcount, ARCH_CONVERT);
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+		/*
+		 * INO btree root block
+		 */
+		disk_addr = XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp));
+		bp = xfs_buf_get(mp->m_ddev_targp,
+			disk_addr,
+			BTOBB(bsize), 0);
+		block = XFS_BUF_TO_SBLOCK(bp);
+		bzero(block, bsize);
+		INT_SET(block->bb_magic, ARCH_CONVERT, XFS_IBT_MAGIC);
+		INT_ZERO(block->bb_level, ARCH_CONVERT);
+		INT_ZERO(block->bb_numrecs, ARCH_CONVERT);
+		INT_SET(block->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+		INT_SET(block->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+		error = xfs_bwrite(mp, bp);
+		if (error) {
+			goto error0;
+		}
+	}
+	xfs_trans_agblocks_delta(tp, nfree);
+	/*
+	 * There are new blocks in the old last a.g.
+	 */
+	if (new) {
+		/*
+		 * Change the agi length.
+		 */
+		error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
+		if (error) {
+			goto error0;
+		}
+		ASSERT(bp);
+		agi = XFS_BUF_TO_AGI(bp);
+		INT_MOD(agi->agi_length, ARCH_CONVERT, new);
+		ASSERT(nagcount == oagcount
+		       || INT_GET(agi->agi_length, ARCH_CONVERT) == mp->m_sb.sb_agblocks);
+		xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+		/*
+		 * Change agf length.
+		 */
+		error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
+		if (error) {
+			goto error0;
+		}
+		ASSERT(bp);
+		agf = XFS_BUF_TO_AGF(bp);
+		INT_MOD(agf->agf_length, ARCH_CONVERT, new);
+		ASSERT(INT_GET(agf->agf_length, ARCH_CONVERT) ==
+				INT_GET(agi->agi_length, ARCH_CONVERT));
+		/*
+		 * Free the new space.
+		 */
+		error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno,
+				INT_GET(agf->agf_length, ARCH_CONVERT) - new), new);
+		if (error) {
+			goto error0;
+		}
+	}
+	if (nagcount > oagcount)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
+	if (nb > mp->m_sb.sb_dblocks)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
+				 nb - mp->m_sb.sb_dblocks);
+	if (nfree)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
+	if (dpct)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+	error = xfs_trans_commit(tp, 0, NULL);
+	if (error) {
+		return error;
+	}
+	if (mp->m_sb.sb_imax_pct) {
+		__uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+		do_div(icount, 100);
+		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+	} else
+		mp->m_maxicount = 0;
+	for (agno = 1; agno < nagcount; agno++) {
+		error = xfs_read_buf(mp, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  sectbb, 0, &bp);
+		if (error) {
+			xfs_fs_cmn_err(CE_WARN, mp,
+			"error %d reading secondary superblock for ag %d\n",
+				error, agno);
+			break;
+		}
+		sbp = XFS_BUF_TO_SBP(bp);
+		xfs_xlatesb(sbp, &mp->m_sb, -1, ARCH_CONVERT, XFS_SB_ALL_BITS);
+		/*
+		 * If we get an error writing out the alternate superblocks,
+		 * just issue a warning and continue.  The real work is
+		 * already done and committed.
+		 */
+		if (!(error = xfs_bwrite(mp, bp))) {
+			continue;
+		} else {
+			xfs_fs_cmn_err(CE_WARN, mp,
+		"write error %d updating secondary superblock for ag %d\n",
+				error, agno);
+			break; /* no point in continuing */
+		}
+	}
+	return 0;
+
+ error0:
+	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+	return error;
+}
+
+static int
+xfs_growfs_log_private(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_growfs_log_t	*in)	/* growfs log input struct */
+{
+	xfs_extlen_t		nb;
+
+	nb = in->newblocks;
+	if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
+		return XFS_ERROR(EINVAL);
+	if (nb == mp->m_sb.sb_logblocks &&
+	    in->isint == (mp->m_sb.sb_logstart != 0))
+		return XFS_ERROR(EINVAL);
+	/*
+	 * Moving the log is hard, need new interfaces to sync
+	 * the log first, hold off all activity while moving it.
+	 * Can have shorter or longer log in the same space,
+	 * or transform internal to external log or vice versa.
+	 */
+	return XFS_ERROR(ENOSYS);
+}
+
+/*
+ * protected versions of growfs function acquire and release locks on the mount
+ * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
+ * XFS_IOC_FSGROWFSRT
+ */
+
+
+int
+xfs_growfs_data(
+	xfs_mount_t		*mp,
+	xfs_growfs_data_t	*in)
+{
+	int error;
+	if (!cpsema(&mp->m_growlock))
+		return XFS_ERROR(EWOULDBLOCK);
+	error = xfs_growfs_data_private(mp, in);
+	vsema(&mp->m_growlock);
+	return error;
+}
+
+int
+xfs_growfs_log(
+	xfs_mount_t		*mp,
+	xfs_growfs_log_t	*in)
+{
+	int error;
+	if (!cpsema(&mp->m_growlock))
+		return XFS_ERROR(EWOULDBLOCK);
+	error = xfs_growfs_log_private(mp, in);
+	vsema(&mp->m_growlock);
+	return error;
+}
+
+/*
+ * exported through ioctl XFS_IOC_FSCOUNTS
+ */
+
+int
+xfs_fs_counts(
+	xfs_mount_t		*mp,
+	xfs_fsop_counts_t	*cnt)
+{
+	unsigned long	s;
+
+	s = XFS_SB_LOCK(mp);
+	cnt->freedata = mp->m_sb.sb_fdblocks;
+	cnt->freertx = mp->m_sb.sb_frextents;
+	cnt->freeino = mp->m_sb.sb_ifree;
+	cnt->allocino = mp->m_sb.sb_icount;
+	XFS_SB_UNLOCK(mp, s);
+	return 0;
+}
+
+/*
+ * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
+ *
+ * xfs_reserve_blocks is called to set m_resblks
+ * in the in-core mount table. The number of unused reserved blocks
+ * is kept in m_resbls_avail.
+ *
+ * Reserve the requested number of blocks if available. Otherwise return
+ * as many as possible to satisfy the request. The actual number
+ * reserved are returned in outval
+ *
+ * A null inval pointer indicates that only the current reserved blocks
+ * available  should  be returned no settings are changed.
+ */
+
+int
+xfs_reserve_blocks(
+	xfs_mount_t		*mp,
+	__uint64_t		*inval,
+	xfs_fsop_resblks_t	*outval)
+{
+	__uint64_t		lcounter, delta;
+	__uint64_t		request;
+	unsigned long s;
+
+	/* If inval is null, report current values and return */
+
+	if (inval == (__uint64_t *)NULL) {
+		outval->resblks = mp->m_resblks;
+		outval->resblks_avail = mp->m_resblks_avail;
+		return(0);
+	}
+
+	request = *inval;
+	s = XFS_SB_LOCK(mp);
+
+	/*
+	 * If our previous reservation was larger than the current value,
+	 * then move any unused blocks back to the free pool.
+	 */
+
+	if (mp->m_resblks > request) {
+		lcounter = mp->m_resblks_avail - request;
+		if (lcounter  > 0) {		/* release unused blocks */
+			mp->m_sb.sb_fdblocks += lcounter;
+			mp->m_resblks_avail -= lcounter;
+		}
+		mp->m_resblks = request;
+	} else {
+		delta = request - mp->m_resblks;
+		lcounter = mp->m_sb.sb_fdblocks;
+		lcounter -= delta;
+		if (lcounter < 0) {
+			/* We can't satisfy the request, just get what we can */
+			mp->m_resblks += mp->m_sb.sb_fdblocks;
+			mp->m_resblks_avail += mp->m_sb.sb_fdblocks;
+			mp->m_sb.sb_fdblocks = 0;
+		} else {
+			mp->m_sb.sb_fdblocks = lcounter;
+			mp->m_resblks = request;
+			mp->m_resblks_avail += delta;
+		}
+	}
+
+	outval->resblks = mp->m_resblks;
+	outval->resblks_avail = mp->m_resblks_avail;
+	XFS_SB_UNLOCK(mp, s);
+	return(0);
+}
+
+void
+xfs_fs_log_dummy(xfs_mount_t *mp)
+{
+	xfs_trans_t *tp;
+	xfs_inode_t *ip;
+
+
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+	atomic_inc(&mp->m_active_trans);
+	if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
+		xfs_trans_cancel(tp, 0);
+		return;
+	}
+
+	ip = mp->m_rootip;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_commit(tp, XFS_TRANS_SYNC, NULL);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+}
+
+int
+xfs_fs_freeze(
+	xfs_mount_t	*mp)
+{
+	vfs_t		*vfsp;
+	/*REFERENCED*/
+	int		error;
+
+	vfsp = XFS_MTOVFS(mp);
+
+	/* Stop new writers */
+	xfs_start_freeze(mp, XFS_FREEZE_WRITE);
+
+	/* Flush the refcache */
+	xfs_refcache_purge_mp(mp);
+
+	/* Flush delalloc and delwri data */
+	VFS_SYNC(vfsp, SYNC_DELWRI|SYNC_WAIT, NULL, error);
+
+	/* Pause transaction subsystem */
+	xfs_start_freeze(mp, XFS_FREEZE_TRANS);
+
+	/* Flush any remaining inodes into buffers */
+	VFS_SYNC(vfsp, SYNC_ATTR|SYNC_WAIT, NULL, error);
+
+	/* Push all buffers out to disk */
+	xfs_binval(mp->m_ddev_targp);
+	if (mp->m_rtdev_targp) {
+		xfs_binval(mp->m_rtdev_targp);
+	}
+
+	/* Push the superblock and write an unmount record */
+	xfs_log_unmount_write(mp);
+	xfs_unmountfs_writesb(mp);
+
+	return 0;
+}
+
+
+int
+xfs_fs_thaw(
+	xfs_mount_t	*mp)
+{
+	xfs_finish_freeze(mp);
+	return 0;
+}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
new file mode 100644
index 000000000000..c0caa7353e51
--- /dev/null
+++ b/fs/xfs/xfs_fsops.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_FSOPS_H__
+#define __XFS_FSOPS_H__
+
+int
+xfs_fs_geometry(
+	xfs_mount_t		*mp,
+	xfs_fsop_geom_t		*geo,
+	int			new_version);
+
+int
+xfs_growfs_data(
+	xfs_mount_t		*mp,
+	xfs_growfs_data_t	*in);
+
+int
+xfs_growfs_log(
+	xfs_mount_t		*mp,
+	xfs_growfs_log_t	*in);
+
+int
+xfs_fs_counts(
+	xfs_mount_t		*mp,
+	xfs_fsop_counts_t	*cnt);
+
+int
+xfs_reserve_blocks(
+	xfs_mount_t		*mp,
+	__uint64_t		*inval,
+	xfs_fsop_resblks_t	*outval);
+
+int
+xfs_fs_freeze(
+	xfs_mount_t		*mp);
+
+int
+xfs_fs_thaw(
+	xfs_mount_t		*mp);
+
+#endif	/* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
new file mode 100644
index 000000000000..185e62f08c70
--- /dev/null
+++ b/fs/xfs/xfs_ialloc.c
@@ -0,0 +1,1336 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Log specified fields for the inode given by bp and off.
+ */
+STATIC void
+xfs_ialloc_log_di(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*bp,		/* inode buffer */
+	int		off,		/* index of inode in buffer */
+	int		fields)		/* bitmask of fields to log */
+{
+	int			first;		/* first byte number */
+	int			ioffset;	/* off in bytes */
+	int			last;		/* last byte number */
+	xfs_mount_t		*mp;		/* mount point structure */
+	static const short	offsets[] = {	/* field offsets */
+						/* keep in sync with bits */
+		offsetof(xfs_dinode_core_t, di_magic),
+		offsetof(xfs_dinode_core_t, di_mode),
+		offsetof(xfs_dinode_core_t, di_version),
+		offsetof(xfs_dinode_core_t, di_format),
+		offsetof(xfs_dinode_core_t, di_onlink),
+		offsetof(xfs_dinode_core_t, di_uid),
+		offsetof(xfs_dinode_core_t, di_gid),
+		offsetof(xfs_dinode_core_t, di_nlink),
+		offsetof(xfs_dinode_core_t, di_projid),
+		offsetof(xfs_dinode_core_t, di_pad),
+		offsetof(xfs_dinode_core_t, di_atime),
+		offsetof(xfs_dinode_core_t, di_mtime),
+		offsetof(xfs_dinode_core_t, di_ctime),
+		offsetof(xfs_dinode_core_t, di_size),
+		offsetof(xfs_dinode_core_t, di_nblocks),
+		offsetof(xfs_dinode_core_t, di_extsize),
+		offsetof(xfs_dinode_core_t, di_nextents),
+		offsetof(xfs_dinode_core_t, di_anextents),
+		offsetof(xfs_dinode_core_t, di_forkoff),
+		offsetof(xfs_dinode_core_t, di_aformat),
+		offsetof(xfs_dinode_core_t, di_dmevmask),
+		offsetof(xfs_dinode_core_t, di_dmstate),
+		offsetof(xfs_dinode_core_t, di_flags),
+		offsetof(xfs_dinode_core_t, di_gen),
+		offsetof(xfs_dinode_t, di_next_unlinked),
+		offsetof(xfs_dinode_t, di_u),
+		offsetof(xfs_dinode_t, di_a),
+		sizeof(xfs_dinode_t)
+	};
+
+
+	ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
+	ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
+	mp = tp->t_mountp;
+	/*
+	 * Get the inode-relative first and last bytes for these fields
+	 */
+	xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
+	/*
+	 * Convert to buffer offsets and log it.
+	 */
+	ioffset = off << mp->m_sb.sb_inodelog;
+	first += ioffset;
+	last += ioffset;
+	xfs_trans_log_buf(tp, bp, first, last);
+}
+
+/*
+ * Allocation group level functions.
+ */
+
+/*
+ * Allocate new inodes in the allocation group specified by agbp.
+ * Return 0 for success, else error code.
+ */
+STATIC int				/* error code or 0 */
+xfs_ialloc_ag_alloc(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*agbp,		/* alloc group buffer */
+	int		*alloc)
+{
+	xfs_agi_t	*agi;		/* allocation group header */
+	xfs_alloc_arg_t args;		/* allocation argument structure */
+	int		blks_per_cluster;  /* fs blocks per inode cluster */
+	xfs_btree_cur_t *cur;		/* inode btree cursor */
+	xfs_daddr_t	d;		/* disk addr of buffer */
+	int		error;
+	xfs_buf_t	*fbuf;		/* new free inodes' buffer */
+	xfs_dinode_t	*free;		/* new free inode structure */
+	int		i;		/* inode counter */
+	int		j;		/* block counter */
+	int		nbufs;		/* num bufs of new inodes */
+	xfs_agino_t	newino;		/* new first inode's number */
+	xfs_agino_t	newlen;		/* new number of inodes */
+	int		ninodes;	/* num inodes per buf */
+	xfs_agino_t	thisino;	/* current inode number, for loop */
+	int		version;	/* inode version number to use */
+	static xfs_timestamp_t ztime;	/* zero xfs timestamp */
+	int		isaligned;	/* inode allocation at stripe unit */
+					/* boundary */
+	xfs_dinode_core_t dic;		/* a dinode_core to copy to new */
+					/* inodes */
+
+	args.tp = tp;
+	args.mp = tp->t_mountp;
+
+	/*
+	 * Locking will ensure that we don't have two callers in here
+	 * at one time.
+	 */
+	newlen = XFS_IALLOC_INODES(args.mp);
+	if (args.mp->m_maxicount &&
+	    args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
+		return XFS_ERROR(ENOSPC);
+	args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
+	/*
+	 * Set the alignment for the allocation.
+	 * If stripe alignment is turned on then align at stripe unit
+	 * boundary.
+	 * If the cluster size is smaller than a filesystem block
+	 * then we're doing I/O for inodes in filesystem block size pieces,
+	 * so don't need alignment anyway.
+	 */
+	isaligned = 0;
+	if (args.mp->m_sinoalign) {
+		ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
+		args.alignment = args.mp->m_dalign;
+		isaligned = 1;
+	} else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
+	    args.mp->m_sb.sb_inoalignmt >=
+	    XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
+		args.alignment = args.mp->m_sb.sb_inoalignmt;
+	else
+		args.alignment = 1;
+	agi = XFS_BUF_TO_AGI(agbp);
+	/*
+	 * Need to figure out where to allocate the inode blocks.
+	 * Ideally they should be spaced out through the a.g.
+	 * For now, just allocate blocks up front.
+	 */
+	args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
+				    args.agbno);
+	/*
+	 * Allocate a fixed-size extent of inodes.
+	 */
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.mod = args.total = args.wasdel = args.isfl = args.userdata =
+		args.minalignslop = 0;
+	args.prod = 1;
+	/*
+	 * Allow space for the inode btree to split.
+	 */
+	args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
+	if ((error = xfs_alloc_vextent(&args)))
+		return error;
+
+	/*
+	 * If stripe alignment is turned on, then try again with cluster
+	 * alignment.
+	 */
+	if (isaligned && args.fsbno == NULLFSBLOCK) {
+		args.type = XFS_ALLOCTYPE_NEAR_BNO;
+		args.agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
+		args.fsbno = XFS_AGB_TO_FSB(args.mp,
+				INT_GET(agi->agi_seqno, ARCH_CONVERT), args.agbno);
+		if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
+			args.mp->m_sb.sb_inoalignmt >=
+			XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
+				args.alignment = args.mp->m_sb.sb_inoalignmt;
+		else
+			args.alignment = 1;
+		if ((error = xfs_alloc_vextent(&args)))
+			 return error;
+	}
+
+	if (args.fsbno == NULLFSBLOCK) {
+		*alloc = 0;
+		return 0;
+	}
+	ASSERT(args.len == args.minlen);
+	/*
+	 * Convert the results.
+	 */
+	newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
+	/*
+	 * Loop over the new block(s), filling in the inodes.
+	 * For small block sizes, manipulate the inodes in buffers
+	 * which are multiples of the blocks size.
+	 */
+	if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
+		blks_per_cluster = 1;
+		nbufs = (int)args.len;
+		ninodes = args.mp->m_sb.sb_inopblock;
+	} else {
+		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
+				   args.mp->m_sb.sb_blocksize;
+		nbufs = (int)args.len / blks_per_cluster;
+		ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
+	}
+	/*
+	 * Figure out what version number to use in the inodes we create.
+	 * If the superblock version has caught up to the one that supports
+	 * the new inode format, then use the new inode version.  Otherwise
+	 * use the old version so that old kernels will continue to be
+	 * able to use the file system.
+	 */
+	if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb))
+		version = XFS_DINODE_VERSION_2;
+	else
+		version = XFS_DINODE_VERSION_1;
+	for (j = 0; j < nbufs; j++) {
+		/*
+		 * Get the block.
+		 */
+		d = XFS_AGB_TO_DADDR(args.mp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
+				     args.agbno + (j * blks_per_cluster));
+		fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
+					 args.mp->m_bsize * blks_per_cluster,
+					 XFS_BUF_LOCK);
+		ASSERT(fbuf);
+		ASSERT(!XFS_BUF_GETERROR(fbuf));
+		/*
+		 * Loop over the inodes in this buffer.
+		 */
+		INT_SET(dic.di_magic, ARCH_CONVERT, XFS_DINODE_MAGIC);
+		INT_ZERO(dic.di_mode, ARCH_CONVERT);
+		INT_SET(dic.di_version, ARCH_CONVERT, version);
+		INT_ZERO(dic.di_format, ARCH_CONVERT);
+		INT_ZERO(dic.di_onlink, ARCH_CONVERT);
+		INT_ZERO(dic.di_uid, ARCH_CONVERT);
+		INT_ZERO(dic.di_gid, ARCH_CONVERT);
+		INT_ZERO(dic.di_nlink, ARCH_CONVERT);
+		INT_ZERO(dic.di_projid, ARCH_CONVERT);
+		bzero(&(dic.di_pad[0]),sizeof(dic.di_pad));
+		INT_SET(dic.di_atime.t_sec, ARCH_CONVERT, ztime.t_sec);
+		INT_SET(dic.di_atime.t_nsec, ARCH_CONVERT, ztime.t_nsec);
+
+		INT_SET(dic.di_mtime.t_sec, ARCH_CONVERT, ztime.t_sec);
+		INT_SET(dic.di_mtime.t_nsec, ARCH_CONVERT, ztime.t_nsec);
+
+		INT_SET(dic.di_ctime.t_sec, ARCH_CONVERT, ztime.t_sec);
+		INT_SET(dic.di_ctime.t_nsec, ARCH_CONVERT, ztime.t_nsec);
+
+		INT_ZERO(dic.di_size, ARCH_CONVERT);
+		INT_ZERO(dic.di_nblocks, ARCH_CONVERT);
+		INT_ZERO(dic.di_extsize, ARCH_CONVERT);
+		INT_ZERO(dic.di_nextents, ARCH_CONVERT);
+		INT_ZERO(dic.di_anextents, ARCH_CONVERT);
+		INT_ZERO(dic.di_forkoff, ARCH_CONVERT);
+		INT_ZERO(dic.di_aformat, ARCH_CONVERT);
+		INT_ZERO(dic.di_dmevmask, ARCH_CONVERT);
+		INT_ZERO(dic.di_dmstate, ARCH_CONVERT);
+		INT_ZERO(dic.di_flags, ARCH_CONVERT);
+		INT_ZERO(dic.di_gen, ARCH_CONVERT);
+
+		for (i = 0; i < ninodes; i++) {
+			free = XFS_MAKE_IPTR(args.mp, fbuf, i);
+			bcopy (&dic, &(free->di_core), sizeof(xfs_dinode_core_t));
+			INT_SET(free->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
+			xfs_ialloc_log_di(tp, fbuf, i,
+				XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
+		}
+		xfs_trans_inode_alloc_buf(tp, fbuf);
+	}
+	INT_MOD(agi->agi_count, ARCH_CONVERT, newlen);
+	INT_MOD(agi->agi_freecount, ARCH_CONVERT, newlen);
+	down_read(&args.mp->m_peraglock);
+	args.mp->m_perag[INT_GET(agi->agi_seqno, ARCH_CONVERT)].pagi_freecount += newlen;
+	up_read(&args.mp->m_peraglock);
+	INT_SET(agi->agi_newino, ARCH_CONVERT, newino);
+	/*
+	 * Insert records describing the new inode chunk into the btree.
+	 */
+	cur = xfs_btree_init_cursor(args.mp, tp, agbp,
+			INT_GET(agi->agi_seqno, ARCH_CONVERT),
+			XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+	for (thisino = newino;
+	     thisino < newino + newlen;
+	     thisino += XFS_INODES_PER_CHUNK) {
+		if ((error = xfs_inobt_lookup_eq(cur, thisino,
+				XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+			return error;
+		}
+		ASSERT(i == 0);
+		if ((error = xfs_inobt_insert(cur, &i))) {
+			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+			return error;
+		}
+		ASSERT(i == 1);
+	}
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	/*
+	 * Log allocation group header fields
+	 */
+	xfs_ialloc_log_agi(tp, agbp,
+		XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
+	/*
+	 * Modify/log superblock values for inode count and inode free count.
+	 */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
+	*alloc = 1;
+	return 0;
+}
+
+/*
+ * Select an allocation group to look for a free inode in, based on the parent
+ * inode and then mode.	 Return the allocation group buffer.
+ */
+STATIC xfs_buf_t *			/* allocation group buffer */
+xfs_ialloc_ag_select(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	parent,		/* parent directory inode number */
+	mode_t		mode,		/* bits set to indicate file type */
+	int		okalloc)	/* ok to allocate more space */
+{
+	xfs_buf_t	*agbp;		/* allocation group header buffer */
+	xfs_agnumber_t	agcount;	/* number of ag's in the filesystem */
+	xfs_agnumber_t	agno;		/* current ag number */
+	int		flags;		/* alloc buffer locking flags */
+	xfs_extlen_t	ineed;		/* blocks needed for inode allocation */
+	xfs_extlen_t	longest = 0;	/* longest extent available */
+	xfs_mount_t	*mp;		/* mount point structure */
+	int		needspace;	/* file mode implies space allocated */
+	xfs_perag_t	*pag;		/* per allocation group data */
+	xfs_agnumber_t	pagno;		/* parent (starting) ag number */
+
+	/*
+	 * Files of these types need at least one block if length > 0
+	 * (and they won't fit in the inode, but that's hard to figure out).
+	 */
+	needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
+	mp = tp->t_mountp;
+	agcount = mp->m_maxagi;
+	if (S_ISDIR(mode))
+		pagno = atomicIncWithWrap((int *)&mp->m_agirotor, agcount);
+	else {
+		pagno = XFS_INO_TO_AGNO(mp, parent);
+		if (pagno >= agcount)
+			pagno = 0;
+	}
+	ASSERT(pagno < agcount);
+	/*
+	 * Loop through allocation groups, looking for one with a little
+	 * free space in it.  Note we don't look for free inodes, exactly.
+	 * Instead, we include whether there is a need to allocate inodes
+	 * to mean that blocks must be allocated for them,
+	 * if none are currently free.
+	 */
+	agno = pagno;
+	flags = XFS_ALLOC_FLAG_TRYLOCK;
+	down_read(&mp->m_peraglock);
+	for (;;) {
+		pag = &mp->m_perag[agno];
+		if (!pag->pagi_init) {
+			if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
+				agbp = NULL;
+				goto nextag;
+			}
+		} else
+			agbp = NULL;
+
+		if (!pag->pagi_inodeok) {
+			atomicIncWithWrap((int *)&mp->m_agirotor, agcount);
+			goto unlock_nextag;
+		}
+
+		/*
+		 * Is there enough free space for the file plus a block
+		 * of inodes (if we need to allocate some)?
+		 */
+		ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
+		if (ineed && !pag->pagf_init) {
+			if (agbp == NULL &&
+			    xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
+				agbp = NULL;
+				goto nextag;
+			}
+			(void)xfs_alloc_pagf_init(mp, tp, agno, flags);
+		}
+		if (!ineed || pag->pagf_init) {
+			if (ineed && !(longest = pag->pagf_longest))
+				longest = pag->pagf_flcount > 0;
+			if (!ineed ||
+			    (pag->pagf_freeblks >= needspace + ineed &&
+			     longest >= ineed &&
+			     okalloc)) {
+				if (agbp == NULL &&
+				    xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
+					agbp = NULL;
+					goto nextag;
+				}
+				up_read(&mp->m_peraglock);
+				return agbp;
+			}
+		}
+unlock_nextag:
+		if (agbp)
+			xfs_trans_brelse(tp, agbp);
+nextag:
+		/*
+		 * No point in iterating over the rest, if we're shutting
+		 * down.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			up_read(&mp->m_peraglock);
+			return (xfs_buf_t *)0;
+		}
+		agno++;
+		if (agno >= agcount)
+			agno = 0;
+		if (agno == pagno) {
+			if (flags == 0) {
+				up_read(&mp->m_peraglock);
+				return (xfs_buf_t *)0;
+			}
+			flags = 0;
+		}
+	}
+}
+
+/*
+ * Visible inode allocation functions.
+ */
+
+/*
+ * Allocate an inode on disk.
+ * Mode is used to tell whether the new inode will need space, and whether
+ * it is a directory.
+ *
+ * The arguments IO_agbp and alloc_done are defined to work within
+ * the constraint of one allocation per transaction.
+ * xfs_dialloc() is designed to be called twice if it has to do an
+ * allocation to make more free inodes.	 On the first call,
+ * IO_agbp should be set to NULL. If an inode is available,
+ * i.e., xfs_dialloc() did not need to do an allocation, an inode
+ * number is returned.	In this case, IO_agbp would be set to the
+ * current ag_buf and alloc_done set to false.
+ * If an allocation needed to be done, xfs_dialloc would return
+ * the current ag_buf in IO_agbp and set alloc_done to true.
+ * The caller should then commit the current transaction, allocate a new
+ * transaction, and call xfs_dialloc() again, passing in the previous
+ * value of IO_agbp.  IO_agbp should be held across the transactions.
+ * Since the agbp is locked across the two calls, the second call is
+ * guaranteed to have a free inode available.
+ *
+ * Once we successfully pick an inode its number is returned and the
+ * on-disk data structures are updated.	 The inode itself is not read
+ * in, since doing so would break ordering constraints with xfs_reclaim.
+ */
+int
+xfs_dialloc(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	parent,		/* parent inode (directory) */
+	mode_t		mode,		/* mode bits for new inode */
+	int		okalloc,	/* ok to allocate more space */
+	xfs_buf_t	**IO_agbp,	/* in/out ag header's buffer */
+	boolean_t	*alloc_done,	/* true if we needed to replenish
+					   inode freelist */
+	xfs_ino_t	*inop)		/* inode number allocated */
+{
+	xfs_agnumber_t	agcount;	/* number of allocation groups */
+	xfs_buf_t	*agbp;		/* allocation group header's buffer */
+	xfs_agnumber_t	agno;		/* allocation group number */
+	xfs_agi_t	*agi;		/* allocation group header structure */
+	xfs_btree_cur_t *cur;		/* inode allocation btree cursor */
+	int		error;		/* error return value */
+	int		i;		/* result code */
+	int		ialloced;	/* inode allocation status */
+	int		noroom = 0;	/* no space for inode blk allocation */
+	xfs_ino_t	ino;		/* fs-relative inode to be returned */
+	/* REFERENCED */
+	int		j;		/* result code */
+	xfs_mount_t	*mp;		/* file system mount structure */
+	int		offset;		/* index of inode in chunk */
+	xfs_agino_t	pagino;		/* parent's a.g. relative inode # */
+	xfs_agnumber_t	pagno;		/* parent's allocation group number */
+	xfs_inobt_rec_t rec;		/* inode allocation record */
+	xfs_agnumber_t	tagno;		/* testing allocation group number */
+	xfs_btree_cur_t *tcur;		/* temp cursor */
+	xfs_inobt_rec_t trec;		/* temp inode allocation record */
+
+
+	if (*IO_agbp == NULL) {
+		/*
+		 * We do not have an agbp, so select an initial allocation
+		 * group for inode allocation.
+		 */
+		agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
+		/*
+		 * Couldn't find an allocation group satisfying the
+		 * criteria, give up.
+		 */
+		if (!agbp) {
+			*inop = NULLFSINO;
+			return 0;
+		}
+		agi = XFS_BUF_TO_AGI(agbp);
+		ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+	} else {
+		/*
+		 * Continue where we left off before.  In this case, we
+		 * know that the allocation group has free inodes.
+		 */
+		agbp = *IO_agbp;
+		agi = XFS_BUF_TO_AGI(agbp);
+		ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+		ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0);
+	}
+	mp = tp->t_mountp;
+	agcount = mp->m_sb.sb_agcount;
+	agno = INT_GET(agi->agi_seqno, ARCH_CONVERT);
+	tagno = agno;
+	pagno = XFS_INO_TO_AGNO(mp, parent);
+	pagino = XFS_INO_TO_AGINO(mp, parent);
+
+	/*
+	 * If we have already hit the ceiling of inode blocks then clear
+	 * okalloc so we scan all available agi structures for a free
+	 * inode.
+	 */
+
+	if (mp->m_maxicount &&
+	    mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
+		noroom = 1;
+		okalloc = 0;
+	}
+
+	/*
+	 * Loop until we find an allocation group that either has free inodes
+	 * or in which we can allocate some inodes.  Iterate through the
+	 * allocation groups upward, wrapping at the end.
+	 */
+	*alloc_done = B_FALSE;
+	while (INT_ISZERO(agi->agi_freecount, ARCH_CONVERT)) {
+		/*
+		 * Don't do anything if we're not supposed to allocate
+		 * any blocks, just go on to the next ag.
+		 */
+		if (okalloc) {
+			/*
+			 * Try to allocate some new inodes in the allocation
+			 * group.
+			 */
+			if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
+				xfs_trans_brelse(tp, agbp);
+				if (error == ENOSPC) {
+					*inop = NULLFSINO;
+					return 0;
+				} else
+					return error;
+			}
+			if (ialloced) {
+				/*
+				 * We successfully allocated some inodes, return
+				 * the current context to the caller so that it
+				 * can commit the current transaction and call
+				 * us again where we left off.
+				 */
+				ASSERT(INT_GET(agi->agi_freecount, ARCH_CONVERT) > 0);
+				*alloc_done = B_TRUE;
+				*IO_agbp = agbp;
+				*inop = NULLFSINO;
+				return 0;
+			}
+		}
+		/*
+		 * If it failed, give up on this ag.
+		 */
+		xfs_trans_brelse(tp, agbp);
+		/*
+		 * Go on to the next ag: get its ag header.
+		 */
+nextag:
+		if (++tagno == agcount)
+			tagno = 0;
+		if (tagno == agno) {
+			*inop = NULLFSINO;
+			return noroom ? ENOSPC : 0;
+		}
+		down_read(&mp->m_peraglock);
+		if (mp->m_perag[tagno].pagi_inodeok == 0) {
+			up_read(&mp->m_peraglock);
+			goto nextag;
+		}
+		error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
+		up_read(&mp->m_peraglock);
+		if (error)
+			goto nextag;
+		agi = XFS_BUF_TO_AGI(agbp);
+		ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+	}
+	/*
+	 * Here with an allocation group that has a free inode.
+	 * Reset agno since we may have chosen a new ag in the
+	 * loop above.
+	 */
+	agno = tagno;
+	*IO_agbp = NULL;
+	cur = xfs_btree_init_cursor(mp, tp, agbp, INT_GET(agi->agi_seqno, ARCH_CONVERT),
+				    XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+	/*
+	 * If pagino is 0 (this is the root inode allocation) use newino.
+	 * This must work because we've just allocated some.
+	 */
+	if (!pagino)
+		pagino = INT_GET(agi->agi_newino, ARCH_CONVERT);
+#ifdef DEBUG
+	if (cur->bc_nlevels == 1) {
+		int	freecount = 0;
+
+		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		do {
+			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+					&rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			freecount += rec.ir_freecount;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error0;
+		} while (i == 1);
+
+		ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
+		       XFS_FORCED_SHUTDOWN(mp));
+	}
+#endif
+	/*
+	 * If in the same a.g. as the parent, try to get near the parent.
+	 */
+	if (pagno == agno) {
+		if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i)))
+			goto error0;
+		if (i != 0 &&
+		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+			    &rec.ir_freecount, &rec.ir_free, &j, ARCH_NOCONVERT)) == 0 &&
+		    j == 1 &&
+		    rec.ir_freecount > 0) {
+			/*
+			 * Found a free inode in the same chunk
+			 * as parent, done.
+			 */
+		}
+		/*
+		 * In the same a.g. as parent, but parent's chunk is full.
+		 */
+		else {
+			int	doneleft;	/* done, to the left */
+			int	doneright;	/* done, to the right */
+
+			if (error)
+				goto error0;
+			ASSERT(i == 1);
+			ASSERT(j == 1);
+			/*
+			 * Duplicate the cursor, search left & right
+			 * simultaneously.
+			 */
+			if ((error = xfs_btree_dup_cursor(cur, &tcur)))
+				goto error0;
+			/*
+			 * Search left with tcur, back up 1 record.
+			 */
+			if ((error = xfs_inobt_decrement(tcur, 0, &i)))
+				goto error1;
+			doneleft = !i;
+			if (!doneleft) {
+				if ((error = xfs_inobt_get_rec(tcur,
+						&trec.ir_startino,
+						&trec.ir_freecount,
+						&trec.ir_free, &i, ARCH_NOCONVERT)))
+					goto error1;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
+			}
+			/*
+			 * Search right with cur, go forward 1 record.
+			 */
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error1;
+			doneright = !i;
+			if (!doneright) {
+				if ((error = xfs_inobt_get_rec(cur,
+						&rec.ir_startino,
+						&rec.ir_freecount,
+						&rec.ir_free, &i, ARCH_NOCONVERT)))
+					goto error1;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
+			}
+			/*
+			 * Loop until we find the closest inode chunk
+			 * with a free one.
+			 */
+			while (!doneleft || !doneright) {
+				int	useleft;  /* using left inode
+						     chunk this time */
+
+				/*
+				 * Figure out which block is closer,
+				 * if both are valid.
+				 */
+				if (!doneleft && !doneright)
+					useleft =
+						pagino -
+						(trec.ir_startino +
+						 XFS_INODES_PER_CHUNK - 1) <
+						 rec.ir_startino - pagino;
+				else
+					useleft = !doneleft;
+				/*
+				 * If checking the left, does it have
+				 * free inodes?
+				 */
+				if (useleft && trec.ir_freecount) {
+					/*
+					 * Yes, set it up as the chunk to use.
+					 */
+					rec = trec;
+					xfs_btree_del_cursor(cur,
+						XFS_BTREE_NOERROR);
+					cur = tcur;
+					break;
+				}
+				/*
+				 * If checking the right, does it have
+				 * free inodes?
+				 */
+				if (!useleft && rec.ir_freecount) {
+					/*
+					 * Yes, it's already set up.
+					 */
+					xfs_btree_del_cursor(tcur,
+						XFS_BTREE_NOERROR);
+					break;
+				}
+				/*
+				 * If used the left, get another one
+				 * further left.
+				 */
+				if (useleft) {
+					if ((error = xfs_inobt_decrement(tcur, 0,
+							&i)))
+						goto error1;
+					doneleft = !i;
+					if (!doneleft) {
+						if ((error = xfs_inobt_get_rec(
+							    tcur,
+							    &trec.ir_startino,
+							    &trec.ir_freecount,
+							    &trec.ir_free, &i, ARCH_NOCONVERT)))
+							goto error1;
+						XFS_WANT_CORRUPTED_GOTO(i == 1,
+							error1);
+					}
+				}
+				/*
+				 * If used the right, get another one
+				 * further right.
+				 */
+				else {
+					if ((error = xfs_inobt_increment(cur, 0,
+							&i)))
+						goto error1;
+					doneright = !i;
+					if (!doneright) {
+						if ((error = xfs_inobt_get_rec(
+							    cur,
+							    &rec.ir_startino,
+							    &rec.ir_freecount,
+							    &rec.ir_free, &i, ARCH_NOCONVERT)))
+							goto error1;
+						XFS_WANT_CORRUPTED_GOTO(i == 1,
+							error1);
+					}
+				}
+			}
+			ASSERT(!doneleft || !doneright);
+		}
+	}
+	/*
+	 * In a different a.g. from the parent.
+	 * See if the most recently allocated block has any free.
+	 */
+	else if (INT_GET(agi->agi_newino, ARCH_CONVERT) != NULLAGINO) {
+		if ((error = xfs_inobt_lookup_eq(cur,
+				INT_GET(agi->agi_newino, ARCH_CONVERT), 0, 0, &i)))
+			goto error0;
+		if (i == 1 &&
+		    (error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+			    &rec.ir_freecount, &rec.ir_free, &j, ARCH_NOCONVERT)) == 0 &&
+		    j == 1 &&
+		    rec.ir_freecount > 0) {
+			/*
+			 * The last chunk allocated in the group still has
+			 * a free inode.
+			 */
+		}
+		/*
+		 * None left in the last group, search the whole a.g.
+		 */
+		else {
+			if (error)
+				goto error0;
+			if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+				goto error0;
+			ASSERT(i == 1);
+			for (;;) {
+				if ((error = xfs_inobt_get_rec(cur,
+						&rec.ir_startino,
+						&rec.ir_freecount, &rec.ir_free,
+						&i, ARCH_NOCONVERT)))
+					goto error0;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+				if (rec.ir_freecount > 0)
+					break;
+				if ((error = xfs_inobt_increment(cur, 0, &i)))
+					goto error0;
+				XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			}
+		}
+	}
+	offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
+	ASSERT(offset >= 0);
+	ASSERT(offset < XFS_INODES_PER_CHUNK);
+	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+				   XFS_INODES_PER_CHUNK) == 0);
+	ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+	XFS_INOBT_CLR_FREE(&rec, offset, ARCH_NOCONVERT);
+	rec.ir_freecount--;
+	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount,
+			rec.ir_free)))
+		goto error0;
+	INT_MOD(agi->agi_freecount, ARCH_CONVERT, -1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	down_read(&mp->m_peraglock);
+	mp->m_perag[tagno].pagi_freecount--;
+	up_read(&mp->m_peraglock);
+#ifdef DEBUG
+	if (cur->bc_nlevels == 1) {
+		int	freecount = 0;
+
+		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			goto error0;
+		do {
+			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+					&rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			freecount += rec.ir_freecount;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error0;
+		} while (i == 1);
+		ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
+		       XFS_FORCED_SHUTDOWN(mp));
+	}
+#endif
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+	*inop = ino;
+	return 0;
+error1:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Free disk inode.  Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	inode)		/* inode to be freed */
+{
+	/* REFERENCED */
+	xfs_agblock_t	agbno;	/* block number containing inode */
+	xfs_buf_t	*agbp;	/* buffer containing allocation group header */
+	xfs_agino_t	agino;	/* inode number relative to allocation group */
+	xfs_agnumber_t	agno;	/* allocation group number */
+	xfs_agi_t	*agi;	/* allocation group header */
+	xfs_btree_cur_t *cur;	/* inode btree cursor */
+	int		error;	/* error return value */
+	int		i;	/* result code */
+	xfs_mount_t	*mp;	/* mount structure for filesystem */
+	int		off;	/* offset of inode in inode chunk */
+	xfs_inobt_rec_t rec;	/* btree record */
+
+	mp = tp->t_mountp;
+
+	/*
+	 * Break up inode number into its components.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, inode);
+	if (agno >= mp->m_sb.sb_agcount)  {
+		cmn_err(CE_WARN,
+			"xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s.  Returning EINVAL.",
+			agno, mp->m_sb.sb_agcount, mp->m_fsname);
+		ASSERT(0);
+		return XFS_ERROR(EINVAL);
+	}
+	agino = XFS_INO_TO_AGINO(mp, inode);
+	if (inode != XFS_AGINO_TO_INO(mp, agno, agino))	 {
+		cmn_err(CE_WARN,
+			"xfs_difree: inode != XFS_AGINO_TO_INO() (%d != %d) on %s.  Returning EINVAL.",
+			inode, XFS_AGINO_TO_INO(mp, agno, agino), mp->m_fsname);
+		ASSERT(0);
+		return XFS_ERROR(EINVAL);
+	}
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agbno >= mp->m_sb.sb_agblocks)  {
+		cmn_err(CE_WARN,
+			"xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s.  Returning EINVAL.",
+			agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
+		ASSERT(0);
+		return XFS_ERROR(EINVAL);
+	}
+	/*
+	 * Get the allocation group header.
+	 */
+	down_read(&mp->m_peraglock);
+	error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+	up_read(&mp->m_peraglock);
+	if (error) {
+		cmn_err(CE_WARN,
+			"xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s.	Returning error.",
+			error, mp->m_fsname);
+		return error;
+	}
+	agi = XFS_BUF_TO_AGI(agbp);
+	ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+	ASSERT(agbno < INT_GET(agi->agi_length, ARCH_CONVERT));
+	/*
+	 * Initialize the cursor.
+	 */
+	cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
+		(xfs_inode_t *)0, 0);
+#ifdef DEBUG
+	if (cur->bc_nlevels == 1) {
+		int freecount = 0;
+
+		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			goto error0;
+		do {
+			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+					&rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			freecount += rec.ir_freecount;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error0;
+		} while (i == 1);
+		ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
+		       XFS_FORCED_SHUTDOWN(mp));
+	}
+#endif
+	/*
+	 * Look for the entry describing this inode.
+	 */
+	if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
+		cmn_err(CE_WARN,
+			"xfs_difree: xfs_inobt_lookup_le returned()  an error %d on %s.	 Returning error.",
+			error, mp->m_fsname);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount,
+			&rec.ir_free, &i, ARCH_NOCONVERT))) {
+		cmn_err(CE_WARN,
+			"xfs_difree: xfs_inobt_get_rec()  returned an error %d on %s.  Returning error.",
+			error, mp->m_fsname);
+		goto error0;
+	}
+	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+	/*
+	 * Get the offset in the inode chunk.
+	 */
+	off = agino - rec.ir_startino;
+	ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
+	ASSERT(!XFS_INOBT_IS_FREE(&rec, off, ARCH_NOCONVERT));
+	/*
+	 * Mark the inode free & increment the count.
+	 */
+	XFS_INOBT_SET_FREE(&rec, off, ARCH_NOCONVERT);
+	rec.ir_freecount++;
+	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
+		cmn_err(CE_WARN,
+			"xfs_difree: xfs_inobt_update()	 returned an error %d on %s.  Returning error.",
+			error, mp->m_fsname);
+		goto error0;
+	}
+	/*
+	 * Change the inode free counts and log the ag/sb changes.
+	 */
+	INT_MOD(agi->agi_freecount, ARCH_CONVERT, 1);
+	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+	down_read(&mp->m_peraglock);
+	mp->m_perag[agno].pagi_freecount++;
+	up_read(&mp->m_peraglock);
+#ifdef DEBUG
+	if (cur->bc_nlevels == 1) {
+		int freecount = 0;
+
+		if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
+			goto error0;
+		do {
+			if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
+					&rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT)))
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+			freecount += rec.ir_freecount;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error0;
+		} while (i == 1);
+		ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
+		       XFS_FORCED_SHUTDOWN(mp));
+	}
+#endif
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Return the location of the inode in bno/off, for mapping it into a buffer.
+ */
+/*ARGSUSED*/
+int
+xfs_dilocate(
+	xfs_mount_t	*mp,	/* file system mount structure */
+	xfs_trans_t	*tp,	/* transaction pointer */
+	xfs_ino_t	ino,	/* inode to locate */
+	xfs_fsblock_t	*bno,	/* output: block containing inode */
+	int		*len,	/* output: num blocks in inode cluster */
+	int		*off,	/* output: index in block of inode */
+	uint		flags)	/* flags concerning inode lookup */
+{
+	xfs_agblock_t	agbno;	/* block number of inode in the alloc group */
+	xfs_buf_t	*agbp;	/* agi buffer */
+	xfs_agino_t	agino;	/* inode number within alloc group */
+	xfs_agnumber_t	agno;	/* allocation group number */
+	int		blks_per_cluster; /* num blocks per inode cluster */
+	xfs_agblock_t	chunk_agbno;	/* first block in inode chunk */
+	xfs_agino_t	chunk_agino;	/* first agino in inode chunk */
+	__int32_t	chunk_cnt;	/* count of free inodes in chunk */
+	xfs_inofree_t	chunk_free;	/* mask of free inodes in chunk */
+	xfs_agblock_t	cluster_agbno;	/* first block in inode cluster */
+	xfs_btree_cur_t *cur;	/* inode btree cursor */
+	int		error;	/* error code */
+	int		i;	/* temp state */
+	int		offset; /* index of inode in its buffer */
+	int		offset_agbno;	/* blks from chunk start to inode */
+
+	ASSERT(ino != NULLFSINO);
+	/*
+	 * Split up the inode number into its parts.
+	 */
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agino = XFS_INO_TO_AGINO(mp, ino);
+	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+	if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
+	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+#ifdef DEBUG
+		if (agno >= mp->m_sb.sb_agcount) {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+					"xfs_dilocate: agno (%d) >= "
+					"mp->m_sb.sb_agcount (%d)",
+					agno,  mp->m_sb.sb_agcount);
+		}
+		if (agbno >= mp->m_sb.sb_agblocks) {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+					"xfs_dilocate: agbno (0x%llx) >= "
+					"mp->m_sb.sb_agblocks (0x%lx)",
+					(unsigned long long) agbno,
+					(unsigned long) mp->m_sb.sb_agblocks);
+		}
+		if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+					"xfs_dilocate: ino (0x%llx) != "
+					"XFS_AGINO_TO_INO(mp, agno, agino) "
+					"(0x%llx)",
+					ino, XFS_AGINO_TO_INO(mp, agno, agino));
+		}
+#endif /* DEBUG */
+		return XFS_ERROR(EINVAL);
+	}
+	if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) ||
+	    !(flags & XFS_IMAP_LOOKUP)) {
+		offset = XFS_INO_TO_OFFSET(mp, ino);
+		ASSERT(offset < mp->m_sb.sb_inopblock);
+		*bno = XFS_AGB_TO_FSB(mp, agno, agbno);
+		*off = offset;
+		*len = 1;
+		return 0;
+	}
+	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+	if (*bno != NULLFSBLOCK) {
+		offset = XFS_INO_TO_OFFSET(mp, ino);
+		ASSERT(offset < mp->m_sb.sb_inopblock);
+		cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno);
+		*off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+			offset;
+		*len = blks_per_cluster;
+		return 0;
+	}
+	if (mp->m_inoalign_mask) {
+		offset_agbno = agbno & mp->m_inoalign_mask;
+		chunk_agbno = agbno - offset_agbno;
+	} else {
+		down_read(&mp->m_peraglock);
+		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+		up_read(&mp->m_peraglock);
+		if (error) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+					"xfs_ialloc_read_agi() returned "
+					"error %d, agno %d",
+					error, agno);
+#endif /* DEBUG */
+			return error;
+		}
+		cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
+			(xfs_inode_t *)0, 0);
+		if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+					"xfs_inobt_lookup_le() failed");
+#endif /* DEBUG */
+			goto error0;
+		}
+		if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
+				&chunk_free, &i, ARCH_NOCONVERT))) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+					"xfs_inobt_get_rec() failed");
+#endif /* DEBUG */
+			goto error0;
+		}
+		if (i == 0) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+					"xfs_inobt_get_rec() failed");
+#endif /* DEBUG */
+			error = XFS_ERROR(EINVAL);
+		}
+		xfs_trans_brelse(tp, agbp);
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		if (error)
+			return error;
+		chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
+		offset_agbno = agbno - chunk_agbno;
+	}
+	ASSERT(agbno >= chunk_agbno);
+	cluster_agbno = chunk_agbno +
+		((offset_agbno / blks_per_cluster) * blks_per_cluster);
+	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
+		XFS_INO_TO_OFFSET(mp, ino);
+	*bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno);
+	*off = offset;
+	*len = blks_per_cluster;
+	return 0;
+error0:
+	xfs_trans_brelse(tp, agbp);
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+	xfs_mount_t	*mp)		/* file system mount structure */
+{
+	int		level;
+	uint		maxblocks;
+	uint		maxleafents;
+	int		minleafrecs;
+	int		minnoderecs;
+
+	maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
+		XFS_INODES_PER_CHUNK_LOG;
+	minleafrecs = mp->m_alloc_mnr[0];
+	minnoderecs = mp->m_alloc_mnr[1];
+	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+	mp->m_in_maxlevels = level;
+}
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_buf_t	*bp,		/* allocation group header buffer */
+	int		fields)		/* bitmask of fields to log */
+{
+	int			first;		/* first byte number */
+	int			last;		/* last byte number */
+	static const short	offsets[] = {	/* field starting offsets */
+					/* keep in sync with bit definitions */
+		offsetof(xfs_agi_t, agi_magicnum),
+		offsetof(xfs_agi_t, agi_versionnum),
+		offsetof(xfs_agi_t, agi_seqno),
+		offsetof(xfs_agi_t, agi_length),
+		offsetof(xfs_agi_t, agi_count),
+		offsetof(xfs_agi_t, agi_root),
+		offsetof(xfs_agi_t, agi_level),
+		offsetof(xfs_agi_t, agi_freecount),
+		offsetof(xfs_agi_t, agi_newino),
+		offsetof(xfs_agi_t, agi_dirino),
+		offsetof(xfs_agi_t, agi_unlinked),
+		sizeof(xfs_agi_t)
+	};
+#ifdef DEBUG
+	xfs_agi_t		*agi;	/* allocation group header */
+
+	agi = XFS_BUF_TO_AGI(bp);
+	ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) ==
+		XFS_AGI_MAGIC);
+#endif
+	/*
+	 * Compute byte offsets for the first and last fields.
+	 */
+	xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
+	/*
+	 * Log the allocation group inode header buffer.
+	 */
+	xfs_trans_log_buf(tp, bp, first, last);
+}
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int
+xfs_ialloc_read_agi(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	xfs_buf_t	**bpp)		/* allocation group hdr buf */
+{
+	xfs_agi_t	*agi;		/* allocation group header */
+	int		agi_ok;		/* agi is consistent */
+	xfs_buf_t	*bp;		/* allocation group hdr buf */
+    xfs_daddr_t		d;		/* disk block address */
+	int		error;
+#ifdef DEBUG
+	int		i;
+#endif
+	xfs_perag_t	*pag;		/* per allocation group data */
+
+
+	ASSERT(agno != NULLAGNUMBER);
+	d = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, 1, 0, &bp)))
+		return error;
+	ASSERT(bp && !XFS_BUF_GETERROR(bp));
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi = XFS_BUF_TO_AGI(bp);
+	agi_ok =
+		INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
+		XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
+	if (XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+			XFS_RANDOM_IALLOC_READ_AGI)) {
+		xfs_trans_brelse(tp, bp);
+#ifdef __KERNEL__	/* additional, temporary, debugging code */
+		cmn_err(CE_NOTE,
+			"EFSCORRUPTED returned from file %s line %d",
+			__FILE__, __LINE__);
+#endif
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	pag = &mp->m_perag[agno];
+	if (!pag->pagi_init) {
+		pag->pagi_freecount = INT_GET(agi->agi_freecount, ARCH_CONVERT);
+		pag->pagi_init = 1;
+	} else {
+		/*
+		 * It's possible for these to be out of sync if
+		 * we are in the middle of a forced shutdown.
+		 */
+		ASSERT(pag->pagi_freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT)
+			|| XFS_FORCED_SHUTDOWN(mp));
+	}
+#ifdef DEBUG
+	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+		ASSERT(!INT_ISZERO(agi->agi_unlinked[i], ARCH_CONVERT));
+#endif
+	XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
+	*bpp = bp;
+	return 0;
+}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
new file mode 100644
index 000000000000..bc88dd9c7a77
--- /dev/null
+++ b/fs/xfs/xfs_ialloc.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_IALLOC_H__
+#define __XFS_IALLOC_H__
+
+struct xfs_buf;
+struct xfs_dinode;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * Allocation parameters for inode allocation.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_INODES)
+int xfs_ialloc_inodes(struct xfs_mount *mp);
+#define XFS_IALLOC_INODES(mp)	xfs_ialloc_inodes(mp)
+#else
+#define XFS_IALLOC_INODES(mp)	((mp)->m_ialloc_inos)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_BLOCKS)
+xfs_extlen_t xfs_ialloc_blocks(struct xfs_mount *mp);
+#define XFS_IALLOC_BLOCKS(mp)	xfs_ialloc_blocks(mp)
+#else
+#define XFS_IALLOC_BLOCKS(mp)	((mp)->m_ialloc_blks)
+#endif
+
+/*
+ * For small block file systems, move inodes in clusters of this size.
+ * When we don't have a lot of memory, however, we go a bit smaller
+ * to reduce the number of AGI and ialloc btree blocks we need to keep
+ * around for xfs_dilocate().  We choose which one to use in
+ * xfs_mount_int().
+ */
+#define XFS_INODE_BIG_CLUSTER_SIZE	8192
+#define XFS_INODE_SMALL_CLUSTER_SIZE	4096
+#define XFS_INODE_CLUSTER_SIZE(mp)	(mp)->m_inode_cluster_size
+
+/*
+ * Make an inode pointer out of the buffer/offset.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MAKE_IPTR)
+struct xfs_dinode *xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o);
+#define XFS_MAKE_IPTR(mp,b,o)		xfs_make_iptr(mp,b,o)
+#else
+#define XFS_MAKE_IPTR(mp,b,o) \
+	((xfs_dinode_t *)(xfs_buf_offset(b, (o) << (mp)->m_sb.sb_inodelog)))
+#endif
+
+/*
+ * Find a free (set) bit in the inode bitmask.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IALLOC_FIND_FREE)
+int xfs_ialloc_find_free(xfs_inofree_t *fp);
+#define XFS_IALLOC_FIND_FREE(fp)	xfs_ialloc_find_free(fp)
+#else
+#define XFS_IALLOC_FIND_FREE(fp)	xfs_lowbit64(*(fp))
+#endif
+
+
+#ifdef __KERNEL__
+
+/*
+ * Prototypes for visible xfs_ialloc.c routines.
+ */
+
+/*
+ * Allocate an inode on disk.
+ * Mode is used to tell whether the new inode will need space, and whether
+ * it is a directory.
+ *
+ * To work within the constraint of one allocation per transaction,
+ * xfs_dialloc() is designed to be called twice if it has to do an
+ * allocation to make more free inodes.	 If an inode is
+ * available without an allocation, agbp would be set to the current
+ * agbp and alloc_done set to false.
+ * If an allocation needed to be done, agbp would be set to the
+ * inode header of the allocation group and alloc_done set to true.
+ * The caller should then commit the current transaction and allocate a new
+ * transaction.	 xfs_dialloc() should then be called again with
+ * the agbp value returned from the previous call.
+ *
+ * Once we successfully pick an inode its number is returned and the
+ * on-disk data structures are updated.	 The inode itself is not read
+ * in, since doing so would break ordering constraints with xfs_reclaim.
+ *
+ * *agbp should be set to NULL on the first call, *alloc_done set to FALSE.
+ */
+int					/* error */
+xfs_dialloc(
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_ino_t	parent,		/* parent inode (directory) */
+	mode_t		mode,		/* mode bits for new inode */
+	int		okalloc,	/* ok to allocate more space */
+	struct xfs_buf	**agbp,		/* buf for a.g. inode header */
+	boolean_t	*alloc_done,	/* an allocation was done to replenish
+					   the free inodes */
+	xfs_ino_t	*inop);		/* inode number allocated */
+
+/*
+ * Free disk inode.  Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int					/* error */
+xfs_difree(
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_ino_t	inode);		/* inode to be freed */
+
+/*
+ * Return the location of the inode in bno/len/off,
+ * for mapping it into a buffer.
+ */
+int
+xfs_dilocate(
+	struct xfs_mount *mp,		/* file system mount structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_ino_t	ino,		/* inode to locate */
+	xfs_fsblock_t	*bno,		/* output: block containing inode */
+	int		*len,		/* output: num blocks in cluster*/
+	int		*off,		/* output: index in block of inode */
+	uint		flags);		/* flags for inode btree lookup */
+
+/*
+ * Compute and fill in value of m_in_maxlevels.
+ */
+void
+xfs_ialloc_compute_maxlevels(
+	struct xfs_mount *mp);		/* file system mount structure */
+
+/*
+ * Log specified fields for the ag hdr (inode section)
+ */
+void
+xfs_ialloc_log_agi(
+	struct xfs_trans *tp,		/* transaction pointer */
+	struct xfs_buf	*bp,		/* allocation group header buffer */
+	int		fields);	/* bitmask of fields to log */
+
+/*
+ * Read in the allocation group header (inode allocation section)
+ */
+int					/* error */
+xfs_ialloc_read_agi(
+	struct xfs_mount *mp,		/* file system mount structure */
+	struct xfs_trans *tp,		/* transaction pointer */
+	xfs_agnumber_t	agno,		/* allocation group number */
+	struct xfs_buf	**bpp);		/* allocation group hdr buf */
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
new file mode 100644
index 000000000000..0feaf85a5455
--- /dev/null
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -0,0 +1,2104 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Inode allocation management for XFS.
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Prototypes for internal functions.
+ */
+
+STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
+STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
+STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
+STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
+STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
+		xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
+STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
+
+/*
+ * Internal functions.
+ */
+
+#ifdef _NOTYET_
+/*
+ * Single level of the xfs_inobt_delete record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int				/* error */
+xfs_inobt_delrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level removing record from */
+	int			*stat)	/* fail/done/go-on */
+{
+	xfs_buf_t		*agbp;	/* buffer for a.g. inode header */
+	xfs_agnumber_t		agfbno; /* agf block of freed btree block */
+	xfs_buf_t		*agfbp; /* bp of agf block of freed block */
+	xfs_agi_t		*agi;	/* allocation group inode header */
+	xfs_inobt_block_t	*block; /* btree block record/key lives in */
+	xfs_agblock_t		bno;	/* btree block number */
+	xfs_buf_t		*bp;	/* buffer for block */
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_inobt_key_t		key;	/* kp points here if block is level 0 */
+	xfs_inobt_key_t		*kp;	/* pointer to btree keys */
+	xfs_agblock_t		lbno;	/* left block's block number */
+	xfs_buf_t		*lbp;	/* left block's buffer pointer */
+	xfs_inobt_block_t	*left;	/* left btree block */
+	xfs_inobt_key_t		*lkp;	/* left block key pointer */
+	xfs_inobt_ptr_t		*lpp;	/* left block address pointer */
+	int			lrecs;	/* number of records in left block */
+	xfs_inobt_rec_t		*lrp;	/* left block record pointer */
+	xfs_inobt_ptr_t		*pp;	/* pointer to btree addresses */
+	int			ptr;	/* index in btree block for this rec */
+	xfs_agblock_t		rbno;	/* right block's block number */
+	xfs_buf_t		*rbp;	/* right block's buffer pointer */
+	xfs_inobt_block_t	*right; /* right btree block */
+	xfs_inobt_key_t		*rkp;	/* right block key pointer */
+	xfs_inobt_rec_t		*rp;	/* pointer to btree records */
+	xfs_inobt_ptr_t		*rpp;	/* right block address pointer */
+	int			rrecs;	/* number of records in right block */
+	xfs_inobt_rec_t		*rrp;	/* right block record pointer */
+	xfs_btree_cur_t		*tcur;	/* temporary btree cursor */
+
+
+	/*
+	 * Get the index of the entry being deleted, check for nothing there.
+	 */
+	ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Get the buffer & block containing the record or key/ptr.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if (error = xfs_btree_check_sblock(cur, block, level, bp))
+		return error;
+#endif
+	/*
+	 * Fail if we're off the end of the block.
+	 */
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * It's a nonleaf.  Excise the key and ptr being deleted, by
+	 * sliding the entries past them down one.
+	 * Log the changed areas of the block.
+	 */
+	if (level > 0) {
+		kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
+		pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = ptr; i < INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) {
+			if (error = xfs_btree_check_sptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))
+				return error;
+		}
+#endif
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			ovbcopy(&kp[ptr], &kp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*kp));
+			ovbcopy(&pp[ptr], &pp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*pp));
+			xfs_inobt_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+			xfs_inobt_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+		}
+	}
+	/*
+	 * It's a leaf.	 Excise the record being deleted, by sliding the
+	 * entries past it down one.  Log the changed areas of the block.
+	 */
+	else {
+		rp = XFS_INOBT_REC_ADDR(block, 1, cur);
+		if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+			ovbcopy(&rp[ptr], &rp[ptr - 1],
+				(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*rp));
+			xfs_inobt_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1);
+		}
+		/*
+		 * If it's the first record in the block, we'll need a key
+		 * structure to pass up to the next level (updkey).
+		 */
+		if (ptr == 1) {
+			INT_COPY(key.ir_startino, rp->ir_startino, ARCH_CONVERT);
+			kp = &key;
+		}
+	}
+	/*
+	 * Decrement and log the number of entries in the block.
+	 */
+	INT_MOD(block->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
+	/*
+	 * Is this the root level?  If so, we're almost done.
+	 */
+	if (level == cur->bc_nlevels - 1) {
+		/*
+		 * If this is the root level,
+		 * and there's only one entry left,
+		 * and it's NOT the leaf level,
+		 * then we can get rid of this level.
+		 */
+		if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == 1 && level > 0) {
+			agbp = cur->bc_private.i.agbp;
+			agi = XFS_BUF_TO_AGI(agbp);
+			/*
+			 * pp is still set to the first pointer in the block.
+			 * Make it the new root of the btree.
+			 */
+			bno = INT_GET(agi->agi_root, ARCH_CONVERT);
+			INT_COPY(agi->agi_root, *pp, ARCH_CONVERT);
+			INT_MOD(agi->agi_level, ARCH_CONVERT, -1);
+			/*
+			 * Free the block.
+			 */
+			if (error = xfs_free_extent(cur->bc_tp, bno, 1))
+				return error;
+			xfs_trans_binval(cur->bc_tp, bp);
+			xfs_ialloc_log_agi(cur->bc_tp, agbp,
+				XFS_AGI_ROOT | XFS_AGI_LEVEL);
+			/*
+			 * Update the cursor so there's one fewer level.
+			 */
+			cur->bc_bufs[level] = NULL;
+			cur->bc_nlevels--;
+			/*
+			 * To ensure that the freed block is not used for
+			 * user data until this transaction is permanent,
+			 * we lock the agf buffer for this ag until the
+			 * transaction record makes it to the on-disk log.
+			 */
+			agfbno = XFS_AG_DADDR(cur->bc_mp,
+					      cur->bc_private.i.agno,
+					      XFS_AGF_DADDR);
+			if (error = xfs_trans_read_buf(cur->bc_mp, cur->bc_tp,
+					cur->bc_mp->m_ddev_targp, agfbno, 1, 0,
+					&agfbp))
+				return error;
+			ASSERT(!XFS_BUF_GETERROR(agfbp));
+			xfs_trans_bhold_until_committed(cur->bc_tp, agfbp);
+		} else if (level > 0 &&
+			   (error = xfs_inobt_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * If we deleted the leftmost entry in the block, update the
+	 * key values above us in the tree.
+	 */
+	if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
+		return error;
+	/*
+	 * If the number of records remaining in the block is at least
+	 * the minimum, we're done.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
+		if (level > 0 &&
+		    (error = xfs_inobt_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * Otherwise, we have to move some records around to keep the
+	 * tree balanced.  Look at the left and right sibling blocks to
+	 * see if we can re-balance by moving only one record.
+	 */
+	rbno = INT_GET(block->bb_rightsib, ARCH_CONVERT);
+	lbno = INT_GET(block->bb_leftsib, ARCH_CONVERT);
+	bno = NULLAGBLOCK;
+	ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
+	/*
+	 * Duplicate the cursor so our btree manipulations here won't
+	 * disrupt the next level up.
+	 */
+	if (error = xfs_btree_dup_cursor(cur, &tcur))
+		return error;
+	/*
+	 * If there's a right sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (rbno != NULLAGBLOCK) {
+		/*
+		 * Move the temp cursor to the last entry in the next block.
+		 * Actually any entry but the first would suffice.
+		 */
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		if (error = xfs_inobt_increment(tcur, level, &i))
+			goto error0;
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		i = xfs_btree_lastrec(tcur, level);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		/*
+		 * Grab a pointer to the block.
+		 */
+		rbp = tcur->bc_bufs[level];
+		right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+#ifdef DEBUG
+		if (error = xfs_btree_check_sblock(cur, right, level, rbp))
+			goto error0;
+#endif
+		/*
+		 * Grab the current block number, for future use.
+		 */
+		bno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		/*
+		 * If right block is full enough so that removing one entry
+		 * won't make it too empty, and left-shifting an entry out
+		 * of right to us works, we're done.
+		 */
+		if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >=
+		     XFS_INOBT_BLOCK_MINRECS(level, cur)) {
+			if (error = xfs_inobt_lshift(tcur, level, &i))
+				goto error0;
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_INOBT_BLOCK_MINRECS(level, cur));
+				xfs_btree_del_cursor(tcur,
+						     XFS_BTREE_NOERROR);
+				if (level > 0 &&
+				    (error = xfs_inobt_decrement(cur, level,
+						&i)))
+					return error;
+				*stat = 1;
+				return 0;
+			}
+		}
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference, and fix up the temp cursor to point
+		 * to our block again (last record).
+		 */
+		rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT);
+		if (lbno != NULLAGBLOCK) {
+			xfs_btree_firstrec(tcur, level);
+			if (error = xfs_inobt_decrement(tcur, level, &i))
+				goto error0;
+		}
+	}
+	/*
+	 * If there's a left sibling, see if it's ok to shift an entry
+	 * out of it.
+	 */
+	if (lbno != NULLAGBLOCK) {
+		/*
+		 * Move the temp cursor to the first entry in the
+		 * previous block.
+		 */
+		xfs_btree_firstrec(tcur, level);
+		if (error = xfs_inobt_decrement(tcur, level, &i))
+			goto error0;
+		xfs_btree_firstrec(tcur, level);
+		/*
+		 * Grab a pointer to the block.
+		 */
+		lbp = tcur->bc_bufs[level];
+		left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+#ifdef DEBUG
+		if (error = xfs_btree_check_sblock(cur, left, level, lbp))
+			goto error0;
+#endif
+		/*
+		 * Grab the current block number, for future use.
+		 */
+		bno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		/*
+		 * If left block is full enough so that removing one entry
+		 * won't make it too empty, and right-shifting an entry out
+		 * of left to us works, we're done.
+		 */
+		if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >=
+		     XFS_INOBT_BLOCK_MINRECS(level, cur)) {
+			if (error = xfs_inobt_rshift(tcur, level, &i))
+				goto error0;
+			if (i) {
+				ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >=
+				       XFS_INOBT_BLOCK_MINRECS(level, cur));
+				xfs_btree_del_cursor(tcur,
+						     XFS_BTREE_NOERROR);
+				if (level == 0)
+					cur->bc_ptrs[0]++;
+				*stat = 1;
+				return 0;
+			}
+		}
+		/*
+		 * Otherwise, grab the number of records in right for
+		 * future reference.
+		 */
+		lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * Delete the temp cursor, we're done with it.
+	 */
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	/*
+	 * If here, we need to do a join to keep the tree balanced.
+	 */
+	ASSERT(bno != NULLAGBLOCK);
+	/*
+	 * See if we can join with the left neighbor block.
+	 */
+	if (lbno != NULLAGBLOCK &&
+	    lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * Set "right" to be the starting block,
+		 * "left" to be the left neighbor.
+		 */
+		rbno = bno;
+		right = block;
+		rbp = bp;
+		if (error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, lbno, 0, &lbp,
+				XFS_INO_BTREE_REF))
+			return error;
+		left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+		if (error = xfs_btree_check_sblock(cur, left, level, lbp))
+			return error;
+	}
+	/*
+	 * If that won't work, see if we can join with the right neighbor block.
+	 */
+	else if (rbno != NULLAGBLOCK &&
+		 rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <=
+		  XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * Set "left" to be the starting block,
+		 * "right" to be the right neighbor.
+		 */
+		lbno = bno;
+		left = block;
+		lbp = bp;
+		if (error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, rbno, 0, &rbp,
+				XFS_INO_BTREE_REF))
+			return error;
+		right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+		if (error = xfs_btree_check_sblock(cur, right, level, rbp))
+			return error;
+	}
+	/*
+	 * Otherwise, we can't fix the imbalance.
+	 * Just return.	 This is probably a logic error, but it's not fatal.
+	 */
+	else {
+		if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
+			return error;
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * We're now going to join "left" and "right" by moving all the stuff
+	 * in "right" to "left" and deleting "right".
+	 */
+	if (level > 0) {
+		/*
+		 * It's a non-leaf.  Move keys and pointers.
+		 */
+		lkp = XFS_INOBT_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		lpp = XFS_INOBT_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
+		rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if (error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))
+				return error;
+		}
+#endif
+		bcopy(rkp, lkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lkp));
+		bcopy(rpp, lpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lpp));
+		xfs_inobt_log_keys(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_inobt_log_ptrs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	} else {
+		/*
+		 * It's a leaf.	 Move records.
+		 */
+		lrp = XFS_INOBT_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur);
+		rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		bcopy(rrp, lrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lrp));
+		xfs_inobt_log_recs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1,
+				   INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	}
+	/*
+	 * Fix up the number of records in the surviving block.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	/*
+	 * Fix up the right block pointer in the surviving block, and log it.
+	 */
+	INT_COPY(left->bb_rightsib, right->bb_rightsib, ARCH_CONVERT);
+	xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	/*
+	 * If there is a right sibling now, make it point to the
+	 * remaining block.
+	 */
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		xfs_inobt_block_t	*rrblock;
+		xfs_buf_t		*rrbp;
+
+		if (error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0,
+				&rrbp, XFS_INO_BTREE_REF))
+			return error;
+		rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
+		if (error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))
+			return error;
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno);
+		xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * Free the deleting block.
+	 */
+	if (error = xfs_free_extent(cur->bc_tp, rbno, 1))
+		return error;
+	xfs_trans_binval(cur->bc_tp, rbp);
+	/*
+	 * To ensure that the freed block is not used for
+	 * user data until this transaction is permanent,
+	 * we lock the agf buffer for this ag until the
+	 * transaction record makes it to the on-disk log.
+	 */
+	agfbno = XFS_AG_DADDR(cur->bc_mp, cur->bc_private.i.agno,
+			      XFS_AGF_DADDR);
+	if (error = xfs_trans_read_buf(cur->bc_mp, cur->bc_tp,
+			cur->bc_mp->m_ddev_targp, agfbno, 1, 0, &agfbp))
+		return error;
+	ASSERT(!XFS_BUF_GETERROR(agfbp));
+	xfs_trans_bhold_until_committed(cur->bc_tp, agfbp);
+	/*
+	 * If we joined with the left neighbor, set the buffer in the
+	 * cursor to the left block, and fix up the index.
+	 */
+	if (bp != lbp) {
+		cur->bc_bufs[level] = lbp;
+		cur->bc_ptrs[level] += INT_GET(left->bb_numrecs, ARCH_CONVERT);
+		cur->bc_ra[level] = 0;
+	}
+	/*
+	 * If we joined with the right neighbor and there's a level above
+	 * us, increment the cursor at that level.
+	 */
+	else if (level + 1 < cur->bc_nlevels &&
+		 (error = xfs_inobt_increment(cur, level + 1, &i))) {
+		return error;
+	}
+	/*
+	 * Readjust the ptr at this level if it's not a leaf, since it's
+	 * still pointing at the deletion point, which makes the cursor
+	 * inconsistent.  If this makes the ptr 0, the caller fixes it up.
+	 * We can't use decrement because it would change the next level up.
+	 */
+	if (level > 0)
+		cur->bc_ptrs[level]--;
+	/*
+	 * Return value means the next level up has something to do.
+	 */
+	*stat = 2;
+	return 0;
+
+error0:
+	xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+	return error;
+}
+#endif	/* _NOTYET_ */
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int				/* error */
+xfs_inobt_insrec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to insert record at */
+	xfs_agblock_t		*bnop,	/* i/o: block number inserted */
+	xfs_inobt_rec_t		*recp,	/* i/o: record data inserted */
+	xfs_btree_cur_t		**curp, /* output: new cursor replacing cur */
+	int			*stat)	/* success/failure */
+{
+	xfs_inobt_block_t	*block; /* btree block record/key lives in */
+	xfs_buf_t		*bp;	/* buffer for block */
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_inobt_key_t		key;	/* key value being inserted */
+	xfs_inobt_key_t		*kp=NULL;	/* pointer to btree keys */
+	xfs_agblock_t		nbno;	/* block number of allocated block */
+	xfs_btree_cur_t		*ncur;	/* new cursor to be used at next lvl */
+	xfs_inobt_key_t		nkey;	/* new key value, from split */
+	xfs_inobt_rec_t		nrec;	/* new record value, for caller */
+	int			optr;	/* old ptr value */
+	xfs_inobt_ptr_t		*pp;	/* pointer to btree addresses */
+	int			ptr;	/* index in btree block for this rec */
+	xfs_inobt_rec_t		*rp=NULL;	/* pointer to btree records */
+
+	/*
+	 * If we made it to the root level, allocate a new root block
+	 * and we're done.
+	 */
+	if (level >= cur->bc_nlevels) {
+		error = xfs_inobt_newroot(cur, &i);
+		*bnop = NULLAGBLOCK;
+		*stat = i;
+		return error;
+	}
+	/*
+	 * Make a key out of the record data to be inserted, and save it.
+	 */
+	key.ir_startino = recp->ir_startino; /* INT_: direct copy */
+	optr = ptr = cur->bc_ptrs[level];
+	/*
+	 * If we're off the left edge, return failure.
+	 */
+	if (ptr == 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Get pointers to the btree buffer and block.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+	/*
+	 * Check that the new entry is being inserted in the right place.
+	 */
+	if (ptr <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		if (level == 0) {
+			rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
+			xfs_btree_check_rec(cur->bc_btnum, recp, rp);
+		} else {
+			kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
+			xfs_btree_check_key(cur->bc_btnum, &key, kp);
+		}
+	}
+#endif
+	nbno = NULLAGBLOCK;
+	ncur = (xfs_btree_cur_t *)0;
+	/*
+	 * If the block is full, we can't insert the new entry until we
+	 * make the block un-full.
+	 */
+	if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		/*
+		 * First, try shifting an entry to the right neighbor.
+		 */
+		if ((error = xfs_inobt_rshift(cur, level, &i)))
+			return error;
+		if (i) {
+			/* nothing */
+		}
+		/*
+		 * Next, try shifting an entry to the left neighbor.
+		 */
+		else {
+			if ((error = xfs_inobt_lshift(cur, level, &i)))
+				return error;
+			if (i) {
+				optr = ptr = cur->bc_ptrs[level];
+			} else {
+				/*
+				 * Next, try splitting the current block
+				 * in half. If this works we have to
+				 * re-set our variables because
+				 * we could be in a different block now.
+				 */
+				if ((error = xfs_inobt_split(cur, level, &nbno,
+						&nkey, &ncur, &i)))
+					return error;
+				if (i) {
+					bp = cur->bc_bufs[level];
+					block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+					if ((error = xfs_btree_check_sblock(cur,
+							block, level, bp)))
+						return error;
+#endif
+					ptr = cur->bc_ptrs[level];
+					nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */
+				} else {
+					/*
+					 * Otherwise the insert fails.
+					 */
+					*stat = 0;
+					return 0;
+				}
+			}
+		}
+	}
+	/*
+	 * At this point we know there's room for our new entry in the block
+	 * we're pointing at.
+	 */
+	if (level > 0) {
+		/*
+		 * It's a non-leaf entry.  Make a hole for the new data
+		 * in the key and ptr regions of the block.
+		 */
+		kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
+		pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(block->bb_numrecs, ARCH_CONVERT); i >= ptr; i--) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		ovbcopy(&kp[ptr - 1], &kp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*kp));
+		ovbcopy(&pp[ptr - 1], &pp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*pp));
+		/*
+		 * Now stuff the new data in, bump numrecs and log the new data.
+		 */
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
+			return error;
+#endif
+		kp[ptr - 1] = key; /* INT_: struct copy */
+		INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
+		INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
+		xfs_inobt_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+		xfs_inobt_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+	} else {
+		/*
+		 * It's a leaf entry.  Make a hole for the new record.
+		 */
+		rp = XFS_INOBT_REC_ADDR(block, 1, cur);
+		ovbcopy(&rp[ptr - 1], &rp[ptr],
+			(INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*rp));
+		/*
+		 * Now stuff the new record in, bump numrecs
+		 * and log the new data.
+		 */
+		rp[ptr - 1] = *recp; /* INT_: struct copy */
+		INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1);
+		xfs_inobt_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT));
+	}
+	/*
+	 * Log the new number of records in the btree header.
+	 */
+	xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
+#ifdef DEBUG
+	/*
+	 * Check that the key/record is in the right place, now.
+	 */
+	if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		if (level == 0)
+			xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
+				rp + ptr);
+		else
+			xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
+				kp + ptr);
+	}
+#endif
+	/*
+	 * If we inserted at the start of a block, update the parents' keys.
+	 */
+	if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
+		return error;
+	/*
+	 * Return the new block number, if any.
+	 * If there is one, give back a record value and a cursor too.
+	 */
+	*bnop = nbno;
+	if (nbno != NULLAGBLOCK) {
+		*recp = nrec; /* INT_: struct copy */
+		*curp = ncur;
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Log header fields from a btree block.
+ */
+STATIC void
+xfs_inobt_log_block(
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			fields) /* mask of fields: XFS_BB_... */
+{
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	static const short	offsets[] = {	/* table of offsets */
+		offsetof(xfs_inobt_block_t, bb_magic),
+		offsetof(xfs_inobt_block_t, bb_level),
+		offsetof(xfs_inobt_block_t, bb_numrecs),
+		offsetof(xfs_inobt_block_t, bb_leftsib),
+		offsetof(xfs_inobt_block_t, bb_rightsib),
+		sizeof(xfs_inobt_block_t)
+	};
+
+	xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
+	xfs_trans_log_buf(tp, bp, first, last);
+}
+
+/*
+ * Log keys from a btree block (nonleaf).
+ */
+STATIC void
+xfs_inobt_log_keys(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			kfirst, /* index of first key to log */
+	int			klast)	/* index of last key to log */
+{
+	xfs_inobt_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	xfs_inobt_key_t		*kp;	/* key pointer in btree block */
+	int			last;	/* last byte offset logged */
+
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+	kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_inobt_log_ptrs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			pfirst, /* index of first pointer to log */
+	int			plast)	/* index of last pointer to log */
+{
+	xfs_inobt_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	xfs_inobt_ptr_t		*pp;	/* block-pointer pointer in btree blk */
+
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+	pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Log records from a btree block (leaf).
+ */
+STATIC void
+xfs_inobt_log_recs(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_buf_t		*bp,	/* buffer containing btree block */
+	int			rfirst, /* index of first record to log */
+	int			rlast)	/* index of last record to log */
+{
+	xfs_inobt_block_t	*block; /* btree block to log from */
+	int			first;	/* first byte offset logged */
+	int			last;	/* last byte offset logged */
+	xfs_inobt_rec_t		*rp;	/* record pointer for btree block */
+
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+	rp = XFS_INOBT_REC_ADDR(block, 1, cur);
+	first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
+	last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
+	xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+}
+
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ * Return 0 if can't find any such record, 1 for success.
+ */
+STATIC int				/* error */
+xfs_inobt_lookup(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_lookup_t		dir,	/* <=, ==, or >= */
+	int			*stat)	/* success/failure */
+{
+	xfs_agblock_t		agbno;	/* a.g. relative btree block number */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	xfs_inobt_block_t	*block=NULL;	/* current btree block */
+	int			diff;	/* difference for the current key */
+	int			error;	/* error return value */
+	int			keyno=0;	/* current key number */
+	int			level;	/* level in the btree */
+	xfs_mount_t		*mp;	/* file system mount point */
+
+	/*
+	 * Get the allocation group header, and the root block number.
+	 */
+	mp = cur->bc_mp;
+	{
+		xfs_agi_t	*agi;	/* a.g. inode header */
+
+		agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+		agno = INT_GET(agi->agi_seqno, ARCH_CONVERT);
+		agbno = INT_GET(agi->agi_root, ARCH_CONVERT);
+	}
+	/*
+	 * Iterate over each level in the btree, starting at the root.
+	 * For each level above the leaves, find the key we need, based
+	 * on the lookup record, then follow the corresponding block
+	 * pointer down to the next level.
+	 */
+	for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+		xfs_buf_t	*bp;	/* buffer pointer for btree block */
+		xfs_daddr_t		d;	/* disk address of btree block */
+
+		/*
+		 * Get the disk address we're looking for.
+		 */
+		d = XFS_AGB_TO_DADDR(mp, agno, agbno);
+		/*
+		 * If the old buffer at this level is for a different block,
+		 * throw it away, otherwise just use it.
+		 */
+		bp = cur->bc_bufs[level];
+		if (bp && XFS_BUF_ADDR(bp) != d)
+			bp = (xfs_buf_t *)0;
+		if (!bp) {
+			/*
+			 * Need to get a new buffer.  Read it, then
+			 * set it in the cursor, releasing the old one.
+			 */
+			if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
+					agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
+				return error;
+			xfs_btree_setbuf(cur, level, bp);
+			/*
+			 * Point to the btree block, now that we have the buffer
+			 */
+			block = XFS_BUF_TO_INOBT_BLOCK(bp);
+			if ((error = xfs_btree_check_sblock(cur, block, level,
+					bp)))
+				return error;
+		} else
+			block = XFS_BUF_TO_INOBT_BLOCK(bp);
+		/*
+		 * If we already had a key match at a higher level, we know
+		 * we need to use the first entry in this block.
+		 */
+		if (diff == 0)
+			keyno = 1;
+		/*
+		 * Otherwise we need to search this block.  Do a binary search.
+		 */
+		else {
+			int		high;	/* high entry number */
+			xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
+			xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
+			int		low;	/* low entry number */
+
+			/*
+			 * Get a pointer to keys or records.
+			 */
+			if (level > 0)
+				kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
+			else
+				krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
+			/*
+			 * Set low and high entry numbers, 1-based.
+			 */
+			low = 1;
+			if (!(high = INT_GET(block->bb_numrecs, ARCH_CONVERT))) {
+				/*
+				 * If the block is empty, the tree must
+				 * be an empty leaf.
+				 */
+				ASSERT(level == 0 && cur->bc_nlevels == 1);
+				cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+				*stat = 0;
+				return 0;
+			}
+			/*
+			 * Binary search the block.
+			 */
+			while (low <= high) {
+				xfs_agino_t	startino;	/* key value */
+
+				/*
+				 * keyno is average of low and high.
+				 */
+				keyno = (low + high) >> 1;
+				/*
+				 * Get startino.
+				 */
+				if (level > 0) {
+					xfs_inobt_key_t *kkp;
+
+					kkp = kkbase + keyno - 1;
+					startino = INT_GET(kkp->ir_startino, ARCH_CONVERT);
+				} else {
+					xfs_inobt_rec_t *krp;
+
+					krp = krbase + keyno - 1;
+					startino = INT_GET(krp->ir_startino, ARCH_CONVERT);
+				}
+				/*
+				 * Compute difference to get next direction.
+				 */
+				diff = (int)startino - cur->bc_rec.i.ir_startino;
+				/*
+				 * Less than, move right.
+				 */
+				if (diff < 0)
+					low = keyno + 1;
+				/*
+				 * Greater than, move left.
+				 */
+				else if (diff > 0)
+					high = keyno - 1;
+				/*
+				 * Equal, we're done.
+				 */
+				else
+					break;
+			}
+		}
+		/*
+		 * If there are more levels, set up for the next level
+		 * by getting the block number and filling in the cursor.
+		 */
+		if (level > 0) {
+			/*
+			 * If we moved left, need the previous key number,
+			 * unless there isn't one.
+			 */
+			if (diff > 0 && --keyno < 1)
+				keyno = 1;
+			agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, keyno, cur), ARCH_CONVERT);
+#ifdef DEBUG
+			if ((error = xfs_btree_check_sptr(cur, agbno, level)))
+				return error;
+#endif
+			cur->bc_ptrs[level] = keyno;
+		}
+	}
+	/*
+	 * Done with the search.
+	 * See if we need to adjust the results.
+	 */
+	if (dir != XFS_LOOKUP_LE && diff < 0) {
+		keyno++;
+		/*
+		 * If ge search and we went off the end of the block, but it's
+		 * not the last block, we're in the wrong block.
+		 */
+		if (dir == XFS_LOOKUP_GE &&
+		    keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT) &&
+		    INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+			int	i;
+
+			cur->bc_ptrs[0] = keyno;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				return error;
+			ASSERT(i == 1);
+			*stat = 1;
+			return 0;
+		}
+	}
+	else if (dir == XFS_LOOKUP_LE && diff > 0)
+		keyno--;
+	cur->bc_ptrs[0] = keyno;
+	/*
+	 * Return if we succeeded or not.
+	 */
+	if (keyno == 0 || keyno > INT_GET(block->bb_numrecs, ARCH_CONVERT))
+		*stat = 0;
+	else
+		*stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
+	return 0;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int				/* error */
+xfs_inobt_lshift(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to shift record on */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+#ifdef DEBUG
+	int			i;	/* loop index */
+#endif
+	xfs_inobt_key_t		key;	/* key value for leaf level upward */
+	xfs_buf_t		*lbp;	/* buffer for left neighbor block */
+	xfs_inobt_block_t	*left;	/* left neighbor btree block */
+	xfs_inobt_key_t		*lkp=NULL;	/* key pointer for left block */
+	xfs_inobt_ptr_t		*lpp;	/* address pointer for left block */
+	xfs_inobt_rec_t		*lrp=NULL;	/* record pointer for left block */
+	int			nrec;	/* new number of left block entries */
+	xfs_buf_t		*rbp;	/* buffer for right (current) block */
+	xfs_inobt_block_t	*right; /* right (current) btree block */
+	xfs_inobt_key_t		*rkp=NULL;	/* key pointer for right block */
+	xfs_inobt_ptr_t		*rpp=NULL;	/* address pointer for right block */
+	xfs_inobt_rec_t		*rrp=NULL;	/* record pointer for right block */
+
+	/*
+	 * Set up variables for this block as "right".
+	 */
+	rbp = cur->bc_bufs[level];
+	right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+		return error;
+#endif
+	/*
+	 * If we've got no left sibling then we can't shift an entry left.
+	 */
+	if (INT_GET(right->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] <= 1) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Set up the left neighbor as "left".
+	 */
+	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.i.agno, INT_GET(right->bb_leftsib, ARCH_CONVERT), 0, &lbp,
+			XFS_INO_BTREE_REF)))
+		return error;
+	left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+	/*
+	 * If it's full, it can't take another entry.
+	 */
+	if (INT_GET(left->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		*stat = 0;
+		return 0;
+	}
+	nrec = INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1;
+	/*
+	 * If non-leaf, copy a key and a ptr to the left block.
+	 */
+	if (level > 0) {
+		lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
+		rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
+		*lkp = *rkp;
+		xfs_inobt_log_keys(cur, lbp, nrec, nrec);
+		lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
+		rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, INT_GET(*rpp, ARCH_CONVERT), level)))
+			return error;
+#endif
+		*lpp = *rpp; /* INT_: no-change copy */
+		xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
+	}
+	/*
+	 * If leaf, copy a record to the left block.
+	 */
+	else {
+		lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
+		rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		*lrp = *rrp;
+		xfs_inobt_log_recs(cur, lbp, nrec, nrec);
+	}
+	/*
+	 * Bump and log left's numrecs, decrement and log right's numrecs.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, +1);
+	xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
+#ifdef DEBUG
+	if (level > 0)
+		xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
+	else
+		xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
+#endif
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
+	/*
+	 * Slide the contents of right down one entry.
+	 */
+	if (level > 0) {
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
+					level)))
+				return error;
+		}
+#endif
+		ovbcopy(rkp + 1, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp + 1, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+		xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+	} else {
+		ovbcopy(rrp + 1, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
+		rkp = &key;
+	}
+	/*
+	 * Update the parent key values of right.
+	 */
+	if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
+		return error;
+	/*
+	 * Slide the cursor value left one.
+	 */
+	cur->bc_ptrs[level]--;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int				/* error */
+xfs_inobt_newroot(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			*stat)	/* success/failure */
+{
+	xfs_agi_t		*agi;	/* a.g. inode header */
+	xfs_alloc_arg_t		args;	/* allocation argument structure */
+	xfs_inobt_block_t	*block; /* one half of the old root block */
+	xfs_buf_t		*bp;	/* buffer containing block */
+	int			error;	/* error return value */
+	xfs_inobt_key_t		*kp;	/* btree key pointer */
+	xfs_agblock_t		lbno;	/* left block number */
+	xfs_buf_t		*lbp;	/* left buffer pointer */
+	xfs_inobt_block_t	*left;	/* left btree block */
+	xfs_buf_t		*nbp;	/* new (root) buffer */
+	xfs_inobt_block_t	*new;	/* new (root) btree block */
+	int			nptr;	/* new value for key index, 1 or 2 */
+	xfs_inobt_ptr_t		*pp;	/* btree address pointer */
+	xfs_agblock_t		rbno;	/* right block number */
+	xfs_buf_t		*rbp;	/* right buffer pointer */
+	xfs_inobt_block_t	*right; /* right btree block */
+	xfs_inobt_rec_t		*rp;	/* btree record pointer */
+
+	ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp));
+
+	/*
+	 * Get a block & a buffer.
+	 */
+	agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno,
+		INT_GET(agi->agi_root, ARCH_CONVERT));
+	args.mod = args.minleft = args.alignment = args.total = args.wasdel =
+		args.isfl = args.userdata = args.minalignslop = 0;
+	args.minlen = args.maxlen = args.prod = 1;
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	if ((error = xfs_alloc_vextent(&args)))
+		return error;
+	/*
+	 * None available, we fail.
+	 */
+	if (args.fsbno == NULLFSBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
+	new = XFS_BUF_TO_INOBT_BLOCK(nbp);
+	/*
+	 * Set the root data in the a.g. inode structure.
+	 */
+	INT_SET(agi->agi_root, ARCH_CONVERT, args.agbno);
+	INT_MOD(agi->agi_level, ARCH_CONVERT, 1);
+	xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp,
+		XFS_AGI_ROOT | XFS_AGI_LEVEL);
+	/*
+	 * At the previous root level there are now two blocks: the old
+	 * root, and the new block generated when it was split.
+	 * We don't know which one the cursor is pointing at, so we
+	 * set up variables "left" and "right" for each case.
+	 */
+	bp = cur->bc_bufs[cur->bc_nlevels - 1];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
+		return error;
+#endif
+	if (INT_GET(block->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		/*
+		 * Our block is left, pick up the right block.
+		 */
+		lbp = bp;
+		lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
+		left = block;
+		rbno = INT_GET(left->bb_rightsib, ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
+				rbno, 0, &rbp, XFS_INO_BTREE_REF)))
+			return error;
+		bp = rbp;
+		right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+		if ((error = xfs_btree_check_sblock(cur, right,
+				cur->bc_nlevels - 1, rbp)))
+			return error;
+		nptr = 1;
+	} else {
+		/*
+		 * Our block is right, pick up the left block.
+		 */
+		rbp = bp;
+		rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
+		right = block;
+		lbno = INT_GET(right->bb_leftsib, ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
+				lbno, 0, &lbp, XFS_INO_BTREE_REF)))
+			return error;
+		bp = lbp;
+		left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+		if ((error = xfs_btree_check_sblock(cur, left,
+				cur->bc_nlevels - 1, lbp)))
+			return error;
+		nptr = 2;
+	}
+	/*
+	 * Fill in the new block's btree header and log it.
+	 */
+	INT_SET(new->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
+	INT_SET(new->bb_level, ARCH_CONVERT, (__uint16_t)cur->bc_nlevels);
+	INT_SET(new->bb_numrecs, ARCH_CONVERT, 2);
+	INT_SET(new->bb_leftsib, ARCH_CONVERT, NULLAGBLOCK);
+	INT_SET(new->bb_rightsib, ARCH_CONVERT, NULLAGBLOCK);
+	xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
+	ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
+	/*
+	 * Fill in the key data in the new root.
+	 */
+	kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
+	if (INT_GET(left->bb_level, ARCH_CONVERT) > 0) {
+		kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */
+		kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */
+	} else {
+		rp = XFS_INOBT_REC_ADDR(left, 1, cur);
+		INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT);
+		rp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT);
+	}
+	xfs_inobt_log_keys(cur, nbp, 1, 2);
+	/*
+	 * Fill in the pointer data in the new root.
+	 */
+	pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
+	INT_SET(pp[0], ARCH_CONVERT, lbno);
+	INT_SET(pp[1], ARCH_CONVERT, rbno);
+	xfs_inobt_log_ptrs(cur, nbp, 1, 2);
+	/*
+	 * Fix up the cursor.
+	 */
+	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+	cur->bc_ptrs[cur->bc_nlevels] = nptr;
+	cur->bc_nlevels++;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int				/* error */
+xfs_inobt_rshift(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to shift record on */
+	int			*stat)	/* success/failure */
+{
+	int			error;	/* error return value */
+	int			i;	/* loop index */
+	xfs_inobt_key_t		key;	/* key value for leaf level upward */
+	xfs_buf_t		*lbp;	/* buffer for left (current) block */
+	xfs_inobt_block_t	*left;	/* left (current) btree block */
+	xfs_inobt_key_t		*lkp;	/* key pointer for left block */
+	xfs_inobt_ptr_t		*lpp;	/* address pointer for left block */
+	xfs_inobt_rec_t		*lrp;	/* record pointer for left block */
+	xfs_buf_t		*rbp;	/* buffer for right neighbor block */
+	xfs_inobt_block_t	*right; /* right neighbor btree block */
+	xfs_inobt_key_t		*rkp;	/* key pointer for right block */
+	xfs_inobt_ptr_t		*rpp;	/* address pointer for right block */
+	xfs_inobt_rec_t		*rrp=NULL;	/* record pointer for right block */
+	xfs_btree_cur_t		*tcur;	/* temporary cursor */
+
+	/*
+	 * Set up variables for this block as "left".
+	 */
+	lbp = cur->bc_bufs[level];
+	left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+#endif
+	/*
+	 * If we've got no right sibling then we can't shift an entry right.
+	 */
+	if (INT_GET(left->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * If the cursor entry is the one that would be moved, don't
+	 * do it... it's too complicated.
+	 */
+	if (cur->bc_ptrs[level] >= INT_GET(left->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Set up the right neighbor as "right".
+	 */
+	if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, &rbp,
+			XFS_INO_BTREE_REF)))
+		return error;
+	right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+	if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+		return error;
+	/*
+	 * If it's full, it can't take another entry.
+	 */
+	if (INT_GET(right->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Make a hole at the start of the right neighbor block, then
+	 * copy the last left block entry to the hole.
+	 */
+	if (level > 0) {
+		lkp = XFS_INOBT_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		lpp = XFS_INOBT_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
+		rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1; i >= 0; i--) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		ovbcopy(rkp, rkp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		ovbcopy(rpp, rpp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sptr(cur, INT_GET(*lpp, ARCH_CONVERT), level)))
+			return error;
+#endif
+		*rkp = *lkp; /* INT_: no change copy */
+		*rpp = *lpp; /* INT_: no change copy */
+		xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+	} else {
+		lrp = XFS_INOBT_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT), cur);
+		rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		ovbcopy(rrp, rrp + 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		*rrp = *lrp;
+		xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1);
+		key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
+		rkp = &key;
+	}
+	/*
+	 * Decrement and log left's numrecs, bump and log right's numrecs.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -1);
+	xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
+	INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+#ifdef DEBUG
+	if (level > 0)
+		xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
+	else
+		xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
+#endif
+	xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
+	/*
+	 * Using a temporary cursor, update the parent key values of the
+	 * block on the right.
+	 */
+	if ((error = xfs_btree_dup_cursor(cur, &tcur)))
+		return error;
+	xfs_btree_lastrec(tcur, level);
+	if ((error = xfs_inobt_increment(tcur, level, &i)) ||
+	    (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
+		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+		return error;
+	}
+	xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and its first record (to be inserted into parent).
+ */
+STATIC int				/* error */
+xfs_inobt_split(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level to split */
+	xfs_agblock_t		*bnop,	/* output: block number allocated */
+	xfs_inobt_key_t		*keyp,	/* output: first key of new block */
+	xfs_btree_cur_t		**curp, /* output: new cursor */
+	int			*stat)	/* success/failure */
+{
+	xfs_alloc_arg_t		args;	/* allocation argument structure */
+	int			error;	/* error return value */
+	int			i;	/* loop index/record number */
+	xfs_agblock_t		lbno;	/* left (current) block number */
+	xfs_buf_t		*lbp;	/* buffer for left block */
+	xfs_inobt_block_t	*left;	/* left (current) btree block */
+	xfs_inobt_key_t		*lkp;	/* left btree key pointer */
+	xfs_inobt_ptr_t		*lpp;	/* left btree address pointer */
+	xfs_inobt_rec_t		*lrp;	/* left btree record pointer */
+	xfs_buf_t		*rbp;	/* buffer for right block */
+	xfs_inobt_block_t	*right; /* right (new) btree block */
+	xfs_inobt_key_t		*rkp;	/* right btree key pointer */
+	xfs_inobt_ptr_t		*rpp;	/* right btree address pointer */
+	xfs_inobt_rec_t		*rrp;	/* right btree record pointer */
+
+	/*
+	 * Set up left block (current one).
+	 */
+	lbp = cur->bc_bufs[level];
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
+	/*
+	 * Allocate the new block.
+	 * If we can't do it, we're toast.  Give up.
+	 */
+	args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno);
+	args.mod = args.minleft = args.alignment = args.total = args.wasdel =
+		args.isfl = args.userdata = args.minalignslop = 0;
+	args.minlen = args.maxlen = args.prod = 1;
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	if ((error = xfs_alloc_vextent(&args)))
+		return error;
+	if (args.fsbno == NULLFSBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.len == 1);
+	rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
+	/*
+	 * Set up the new block as "right".
+	 */
+	right = XFS_BUF_TO_INOBT_BLOCK(rbp);
+	/*
+	 * "Left" is the current (according to the cursor) block.
+	 */
+	left = XFS_BUF_TO_INOBT_BLOCK(lbp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
+		return error;
+#endif
+	/*
+	 * Fill in the btree header for the new block.
+	 */
+	INT_SET(right->bb_magic, ARCH_CONVERT, xfs_magics[cur->bc_btnum]);
+	right->bb_level = left->bb_level; /* INT_: direct copy */
+	INT_SET(right->bb_numrecs, ARCH_CONVERT, (__uint16_t)(INT_GET(left->bb_numrecs, ARCH_CONVERT) / 2));
+	/*
+	 * Make sure that if there's an odd number of entries now, that
+	 * each new block will have the same number of entries.
+	 */
+	if ((INT_GET(left->bb_numrecs, ARCH_CONVERT) & 1) &&
+	    cur->bc_ptrs[level] <= INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1)
+		INT_MOD(right->bb_numrecs, ARCH_CONVERT, +1);
+	i = INT_GET(left->bb_numrecs, ARCH_CONVERT) - INT_GET(right->bb_numrecs, ARCH_CONVERT) + 1;
+	/*
+	 * For non-leaf blocks, copy keys and addresses over to the new block.
+	 */
+	if (level > 0) {
+		lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
+		lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
+		rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
+		rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
+#ifdef DEBUG
+		for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) {
+			if ((error = xfs_btree_check_sptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level)))
+				return error;
+		}
+#endif
+		bcopy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rkp));
+		bcopy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rpp));
+		xfs_inobt_log_keys(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		xfs_inobt_log_ptrs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		*keyp = *rkp;
+	}
+	/*
+	 * For leaf blocks, copy records over to the new block.
+	 */
+	else {
+		lrp = XFS_INOBT_REC_ADDR(left, i, cur);
+		rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
+		bcopy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*rrp));
+		xfs_inobt_log_recs(cur, rbp, 1, INT_GET(right->bb_numrecs, ARCH_CONVERT));
+		keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */
+	}
+	/*
+	 * Find the left block number by looking in the buffer.
+	 * Adjust numrecs, sibling pointers.
+	 */
+	INT_MOD(left->bb_numrecs, ARCH_CONVERT, -(INT_GET(right->bb_numrecs, ARCH_CONVERT)));
+	right->bb_rightsib = left->bb_rightsib; /* INT_: direct copy */
+	INT_SET(left->bb_rightsib, ARCH_CONVERT, args.agbno);
+	INT_SET(right->bb_leftsib, ARCH_CONVERT, lbno);
+	xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
+	xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+	/*
+	 * If there's a block to the new block's right, make that block
+	 * point back to right instead of to left.
+	 */
+	if (INT_GET(right->bb_rightsib, ARCH_CONVERT) != NULLAGBLOCK) {
+		xfs_inobt_block_t	*rrblock;	/* rr btree block */
+		xfs_buf_t		*rrbp;		/* buffer for rrblock */
+
+		if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
+				INT_GET(right->bb_rightsib, ARCH_CONVERT), 0, &rrbp,
+				XFS_INO_BTREE_REF)))
+			return error;
+		rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
+		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
+			return error;
+		INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, args.agbno);
+		xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * If the cursor is really in the right block, move it there.
+	 * If it's just pointing past the last entry in left, then we'll
+	 * insert there, so don't change anything in that case.
+	 */
+	if (cur->bc_ptrs[level] > INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1) {
+		xfs_btree_setbuf(cur, level, rbp);
+		cur->bc_ptrs[level] -= INT_GET(left->bb_numrecs, ARCH_CONVERT);
+	}
+	/*
+	 * If there are more levels, we'll need another cursor which refers
+	 * the right block, no matter where this cursor was.
+	 */
+	if (level + 1 < cur->bc_nlevels) {
+		if ((error = xfs_btree_dup_cursor(cur, curp)))
+			return error;
+		(*curp)->bc_ptrs[level + 1]++;
+	}
+	*bnop = args.agbno;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int				/* error */
+xfs_inobt_updkey(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_inobt_key_t		*keyp,	/* new key value to update to */
+	int			level)	/* starting level for update */
+{
+	int			ptr;	/* index of key in block */
+
+	/*
+	 * Go up the tree from this level toward the root.
+	 * At each level, update the key value to the value input.
+	 * Stop when we reach a level where the cursor isn't pointing
+	 * at the first entry in the block.
+	 */
+	for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+		xfs_buf_t		*bp;	/* buffer for block */
+		xfs_inobt_block_t	*block; /* btree block */
+#ifdef DEBUG
+		int			error;	/* error return value */
+#endif
+		xfs_inobt_key_t		*kp;	/* ptr to btree block keys */
+
+		bp = cur->bc_bufs[level];
+		block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+			return error;
+#endif
+		ptr = cur->bc_ptrs[level];
+		kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
+		*kp = *keyp;
+		xfs_inobt_log_keys(cur, bp, ptr, ptr);
+	}
+	return 0;
+}
+
+/*
+ * Externally visible routines.
+ */
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_inobt_decrement(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat)	/* success/failure */
+{
+	xfs_inobt_block_t	*block; /* btree block */
+	int			error;
+	int			lev;	/* btree level */
+
+	ASSERT(level < cur->bc_nlevels);
+	/*
+	 * Read-ahead to the left at this level.
+	 */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+	/*
+	 * Decrement the ptr at this level.  If we're still in the block
+	 * then we're done.
+	 */
+	if (--cur->bc_ptrs[level] > 0) {
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * Get a pointer to the btree block.
+	 */
+	block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level,
+			cur->bc_bufs[level])))
+		return error;
+#endif
+	/*
+	 * If we just went off the left edge of the tree, return failure.
+	 */
+	if (INT_GET(block->bb_leftsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * March up the tree decrementing pointers.
+	 * Stop when we don't go off the left edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		if (--cur->bc_ptrs[lev] > 0)
+			break;
+		/*
+		 * Read-ahead the left block, we're going to read it
+		 * in the next loop.
+		 */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+	}
+	/*
+	 * If we went off the root then we are seriously confused.
+	 */
+	ASSERT(lev < cur->bc_nlevels);
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
+		xfs_agblock_t	agbno;	/* block number of btree block */
+		xfs_buf_t	*bp;	/* buffer containing btree block */
+
+		agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, agbno, 0, &bp,
+				XFS_INO_BTREE_REF)))
+			return error;
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_INOBT_BLOCK(bp);
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+		cur->bc_ptrs[lev] = INT_GET(block->bb_numrecs, ARCH_CONVERT);
+	}
+	*stat = 1;
+	return 0;
+}
+
+#ifdef _NOTYET_
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int					/* error */
+xfs_inobt_delete(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		*stat)		/* success/failure */
+{
+	int		error;
+	int		i;		/* result code */
+	int		level;		/* btree level */
+
+	/*
+	 * Go up the tree, starting at leaf level.
+	 * If 2 is returned then a join was done; go to the next level.
+	 * Otherwise we are done.
+	 */
+	for (level = 0, i = 2; i == 2; level++) {
+		if (error = xfs_inobt_delrec(cur, level, &i))
+			return error;
+	}
+	if (i == 0) {
+		for (level = 1; level < cur->bc_nlevels; level++) {
+			if (cur->bc_ptrs[level] == 0) {
+				if (error = xfs_inobt_decrement(cur, level, &i))
+					return error;
+				break;
+			}
+		}
+	}
+	*stat = i;
+	return 0;
+}
+#endif	/* _NOTYET_ */
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_inobt_get_rec(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agino_t		*ino,	/* output: starting inode of chunk */
+	__int32_t		*fcnt,	/* output: number of free inodes */
+	xfs_inofree_t		*free,	/* output: free inode mask */
+	int			*stat,	/* output: success/failure */
+	xfs_arch_t		arch)	/* input: architecture */
+{
+	xfs_inobt_block_t	*block; /* btree block */
+	xfs_buf_t		*bp;	/* buffer containing btree block */
+#ifdef DEBUG
+	int			error;	/* error return value */
+#endif
+	int			ptr;	/* record number */
+	xfs_inobt_rec_t		*rec;	/* record data */
+
+	bp = cur->bc_bufs[0];
+	ptr = cur->bc_ptrs[0];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
+		return error;
+#endif
+	/*
+	 * Off the right end or left end, return failure.
+	 */
+	if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT) || ptr <= 0) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * Point to the record and extract its data.
+	 */
+	rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
+	ASSERT(arch == ARCH_NOCONVERT || arch == ARCH_CONVERT);
+	if (arch == ARCH_NOCONVERT) {
+	    *ino = INT_GET(rec->ir_startino, ARCH_CONVERT);
+	    *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT);
+	    *free = INT_GET(rec->ir_free, ARCH_CONVERT);
+	} else {
+	    INT_COPY(*ino, rec->ir_startino, ARCH_CONVERT);
+	    INT_COPY(*fcnt, rec->ir_freecount, ARCH_CONVERT);
+	    INT_COPY(*free, rec->ir_free, ARCH_CONVERT);
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_inobt_increment(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat)	/* success/failure */
+{
+	xfs_inobt_block_t	*block; /* btree block */
+	xfs_buf_t		*bp;	/* buffer containing btree block */
+	int			error;	/* error return value */
+	int			lev;	/* btree level */
+
+	ASSERT(level < cur->bc_nlevels);
+	/*
+	 * Read-ahead to the right at this level.
+	 */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+	/*
+	 * Get a pointer to the btree block.
+	 */
+	bp = cur->bc_bufs[level];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+		return error;
+#endif
+	/*
+	 * Increment the ptr at this level.  If we're still in the block
+	 * then we're done.
+	 */
+	if (++cur->bc_ptrs[level] <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) {
+		*stat = 1;
+		return 0;
+	}
+	/*
+	 * If we just went off the right edge of the tree, return failure.
+	 */
+	if (INT_GET(block->bb_rightsib, ARCH_CONVERT) == NULLAGBLOCK) {
+		*stat = 0;
+		return 0;
+	}
+	/*
+	 * March up the tree incrementing pointers.
+	 * Stop when we don't go off the right edge of a block.
+	 */
+	for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+		bp = cur->bc_bufs[lev];
+		block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+#endif
+		if (++cur->bc_ptrs[lev] <= INT_GET(block->bb_numrecs, ARCH_CONVERT))
+			break;
+		/*
+		 * Read-ahead the right block, we're going to read it
+		 * in the next loop.
+		 */
+		xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+	}
+	/*
+	 * If we went off the root then we are seriously confused.
+	 */
+	ASSERT(lev < cur->bc_nlevels);
+	/*
+	 * Now walk back down the tree, fixing up the cursor's buffer
+	 * pointers and key numbers.
+	 */
+	for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
+	     lev > level; ) {
+		xfs_agblock_t	agbno;	/* block number of btree block */
+
+		agbno = INT_GET(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
+				cur->bc_private.i.agno, agbno, 0, &bp,
+				XFS_INO_BTREE_REF)))
+			return error;
+		lev--;
+		xfs_btree_setbuf(cur, lev, bp);
+		block = XFS_BUF_TO_INOBT_BLOCK(bp);
+		if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
+			return error;
+		cur->bc_ptrs[lev] = 1;
+	}
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Insert the current record at the point referenced by cur.
+ * The cursor may be inconsistent on return if splits have been done.
+ */
+int					/* error */
+xfs_inobt_insert(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	int		*stat)		/* success/failure */
+{
+	int		error;		/* error return value */
+	int		i;		/* result value, 0 for failure */
+	int		level;		/* current level number in btree */
+	xfs_agblock_t	nbno;		/* new block number (split result) */
+	xfs_btree_cur_t *ncur;		/* new cursor (split result) */
+	xfs_inobt_rec_t nrec;		/* record being inserted this level */
+	xfs_btree_cur_t *pcur;		/* previous level's cursor */
+
+	level = 0;
+	nbno = NULLAGBLOCK;
+	INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino);
+	INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount);
+	INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free);
+	ncur = (xfs_btree_cur_t *)0;
+	pcur = cur;
+	/*
+	 * Loop going up the tree, starting at the leaf level.
+	 * Stop when we don't get a split block, that must mean that
+	 * the insert is finished with this level.
+	 */
+	do {
+		/*
+		 * Insert nrec/nbno into this level of the tree.
+		 * Note if we fail, nbno will be null.
+		 */
+		if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
+				&i))) {
+			if (pcur != cur)
+				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+			return error;
+		}
+		/*
+		 * See if the cursor we just used is trash.
+		 * Can't trash the caller's cursor, but otherwise we should
+		 * if ncur is a new cursor or we're about to be done.
+		 */
+		if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
+			cur->bc_nlevels = pcur->bc_nlevels;
+			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+		}
+		/*
+		 * If we got a new cursor, switch to it.
+		 */
+		if (ncur) {
+			pcur = ncur;
+			ncur = (xfs_btree_cur_t *)0;
+		}
+	} while (nbno != NULLAGBLOCK);
+	*stat = i;
+	return 0;
+}
+
+/*
+ * Lookup the record equal to ino in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_eq(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agino_t	ino,		/* starting inode of chunk */
+	__int32_t	fcnt,		/* free inode count */
+	xfs_inofree_t	free,		/* free inode mask */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_freecount = fcnt;
+	cur->bc_rec.i.ir_free = free;
+	return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_ge(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agino_t	ino,		/* starting inode of chunk */
+	__int32_t	fcnt,		/* free inode count */
+	xfs_inofree_t	free,		/* free inode mask */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_freecount = fcnt;
+	cur->bc_rec.i.ir_free = free;
+	return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_le(
+	xfs_btree_cur_t *cur,		/* btree cursor */
+	xfs_agino_t	ino,		/* starting inode of chunk */
+	__int32_t	fcnt,		/* free inode count */
+	xfs_inofree_t	free,		/* free inode mask */
+	int		*stat)		/* success/failure */
+{
+	cur->bc_rec.i.ir_startino = ino;
+	cur->bc_rec.i.ir_freecount = fcnt;
+	cur->bc_rec.i.ir_free = free;
+	return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur, to the value given
+ * by [ino, fcnt, free].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int					/* error */
+xfs_inobt_update(
+	xfs_btree_cur_t		*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free)	/* free inode mask */
+{
+	xfs_inobt_block_t	*block; /* btree block to update */
+	xfs_buf_t		*bp;	/* buffer containing btree block */
+	int			error;	/* error return value */
+	int			ptr;	/* current record number (updating) */
+	xfs_inobt_rec_t		*rp;	/* pointer to updated record */
+
+	/*
+	 * Pick up the current block.
+	 */
+	bp = cur->bc_bufs[0];
+	block = XFS_BUF_TO_INOBT_BLOCK(bp);
+#ifdef DEBUG
+	if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
+		return error;
+#endif
+	/*
+	 * Get the address of the rec to be updated.
+	 */
+	ptr = cur->bc_ptrs[0];
+	rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
+	/*
+	 * Fill in the new contents and log them.
+	 */
+	INT_SET(rp->ir_startino, ARCH_CONVERT, ino);
+	INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt);
+	INT_SET(rp->ir_free, ARCH_CONVERT, free);
+	xfs_inobt_log_recs(cur, bp, ptr, ptr);
+	/*
+	 * Updating first record in leaf. Pass new key value up to our parent.
+	 */
+	if (ptr == 1) {
+		xfs_inobt_key_t key;	/* key containing [ino] */
+
+		INT_SET(key.ir_startino, ARCH_CONVERT, ino);
+		if ((error = xfs_inobt_updkey(cur, &key, 1)))
+			return error;
+	}
+	return 0;
+}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
new file mode 100644
index 000000000000..fd27bc62bf44
--- /dev/null
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_IALLOC_BTREE_H__
+#define __XFS_IALLOC_BTREE_H__
+
+/*
+ * Inode map on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_btree_sblock;
+struct xfs_mount;
+
+/*
+ * There is a btree for the inode map per allocation group.
+ */
+#define XFS_IBT_MAGIC	0x49414254	/* 'IABT' */
+
+typedef __uint64_t	xfs_inofree_t;
+#define XFS_INODES_PER_CHUNK	(NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK_LOG	(XFS_NBBYLOG + 3)
+#define XFS_INOBT_ALL_FREE	((xfs_inofree_t)-1)
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASKN)
+xfs_inofree_t xfs_inobt_maskn(int i, int n);
+#define XFS_INOBT_MASKN(i,n)		xfs_inobt_maskn(i,n)
+#else
+#define XFS_INOBT_MASKN(i,n)	\
+	((((n) >= XFS_INODES_PER_CHUNK ? \
+		(xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i))
+#endif
+
+/*
+ * Data record structure
+ */
+typedef struct xfs_inobt_rec
+{
+	xfs_agino_t	ir_startino;	/* starting inode number */
+	__int32_t	ir_freecount;	/* count of free inodes (set bits) */
+	xfs_inofree_t	ir_free;	/* free inode mask */
+} xfs_inobt_rec_t;
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key
+{
+	xfs_agino_t	ir_startino;	/* starting inode number */
+} xfs_inobt_key_t;
+
+typedef xfs_agblock_t xfs_inobt_ptr_t;	/* btree pointer type */
+					/* btree block header type */
+typedef struct xfs_btree_sblock xfs_inobt_block_t;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_INOBT_BLOCK)
+xfs_inobt_block_t *xfs_buf_to_inobt_block(struct xfs_buf *bp);
+#define XFS_BUF_TO_INOBT_BLOCK(bp)	xfs_buf_to_inobt_block(bp)
+#else
+#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)(XFS_BUF_PTR(bp)))
+#endif
+
+/*
+ * Bit manipulations for ir_free.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_MASK)
+xfs_inofree_t xfs_inobt_mask(int i);
+#define XFS_INOBT_MASK(i)		xfs_inobt_mask(i)
+#else
+#define XFS_INOBT_MASK(i)		((xfs_inofree_t)1 << (i))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_FREE)
+int xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch);
+#define XFS_INOBT_IS_FREE(rp,i,arch)	xfs_inobt_is_free(rp,i,arch)
+#else
+#define XFS_INOBT_IS_FREE(rp,i,arch)	((INT_GET((rp)->ir_free, arch) \
+					 & XFS_INOBT_MASK(i)) != 0)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_SET_FREE)
+void xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch);
+#define XFS_INOBT_SET_FREE(rp,i,arch)	xfs_inobt_set_free(rp,i,arch)
+#else
+#define XFS_INOBT_SET_FREE(rp,i,arch)	(INT_MOD_EXPR((rp)->ir_free, arch, |= XFS_INOBT_MASK(i)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_CLR_FREE)
+void xfs_inobt_clr_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch);
+#define XFS_INOBT_CLR_FREE(rp,i,arch)	xfs_inobt_clr_free(rp,i,arch)
+#else
+#define XFS_INOBT_CLR_FREE(rp,i,arch)	(INT_MOD_EXPR((rp)->ir_free, arch, &= ~XFS_INOBT_MASK(i)))
+#endif
+
+/*
+ * Real block structures have a size equal to the disk block size.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_SIZE)
+int xfs_inobt_block_size(int lev, struct xfs_btree_cur *cur);
+#define XFS_INOBT_BLOCK_SIZE(lev,cur)	xfs_inobt_block_size(lev,cur)
+#else
+#define XFS_INOBT_BLOCK_SIZE(lev,cur)	(1 << (cur)->bc_blocklog)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MAXRECS)
+int xfs_inobt_block_maxrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_INOBT_BLOCK_MAXRECS(lev,cur)	xfs_inobt_block_maxrecs(lev,cur)
+#else
+#define XFS_INOBT_BLOCK_MAXRECS(lev,cur)	\
+	((cur)->bc_mp->m_inobt_mxr[lev != 0])
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_BLOCK_MINRECS)
+int xfs_inobt_block_minrecs(int lev, struct xfs_btree_cur *cur);
+#define XFS_INOBT_BLOCK_MINRECS(lev,cur)	xfs_inobt_block_minrecs(lev,cur)
+#else
+#define XFS_INOBT_BLOCK_MINRECS(lev,cur)	\
+	((cur)->bc_mp->m_inobt_mnr[lev != 0])
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_LAST_REC)
+int xfs_inobt_is_last_rec(struct xfs_btree_cur *cur);
+#define XFS_INOBT_IS_LAST_REC(cur)	xfs_inobt_is_last_rec(cur)
+#else
+#define XFS_INOBT_IS_LAST_REC(cur)	\
+	((cur)->bc_ptrs[0] == \
+		INT_GET(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs, ARCH_CONVERT))
+#endif
+
+/*
+ * Maximum number of inode btree levels.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IN_MAXLEVELS)
+int xfs_in_maxlevels(struct xfs_mount *mp);
+#define XFS_IN_MAXLEVELS(mp)		xfs_in_maxlevels(mp)
+#else
+#define XFS_IN_MAXLEVELS(mp)		((mp)->m_in_maxlevels)
+#endif
+
+/*
+ * block numbers in the AG.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IBT_BLOCK)
+xfs_agblock_t xfs_ibt_block(struct xfs_mount *mp);
+#define XFS_IBT_BLOCK(mp)		xfs_ibt_block(mp)
+#else
+#define XFS_IBT_BLOCK(mp)	((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_PREALLOC_BLOCKS)
+xfs_agblock_t xfs_prealloc_blocks(struct xfs_mount *mp);
+#define XFS_PREALLOC_BLOCKS(mp)		xfs_prealloc_blocks(mp)
+#else
+#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+#endif
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_REC_ADDR)
+xfs_inobt_rec_t *
+xfs_inobt_rec_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_INOBT_REC_ADDR(bb,i,cur)	xfs_inobt_rec_addr(bb,i,cur)
+#else
+#define XFS_INOBT_REC_ADDR(bb,i,cur)	\
+	XFS_BTREE_REC_ADDR(XFS_INOBT_BLOCK_SIZE(0,cur), xfs_inobt, bb, i, \
+		XFS_INOBT_BLOCK_MAXRECS(0, cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_KEY_ADDR)
+xfs_inobt_key_t *
+xfs_inobt_key_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_INOBT_KEY_ADDR(bb,i,cur)	xfs_inobt_key_addr(bb,i,cur)
+#else
+#define XFS_INOBT_KEY_ADDR(bb,i,cur)	\
+	XFS_BTREE_KEY_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \
+		XFS_INOBT_BLOCK_MAXRECS(1, cur))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_PTR_ADDR)
+xfs_inobt_ptr_t *
+xfs_inobt_ptr_addr(xfs_inobt_block_t *bb, int i, struct xfs_btree_cur *cur);
+#define XFS_INOBT_PTR_ADDR(bb,i,cur)	xfs_inobt_ptr_addr(bb,i,cur)
+#else
+#define XFS_INOBT_PTR_ADDR(bb,i,cur)	\
+	XFS_BTREE_PTR_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, i, \
+		XFS_INOBT_BLOCK_MAXRECS(1, cur))
+#endif
+
+/*
+ * Prototypes for externally visible routines.
+ */
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_inobt_decrement(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat); /* success/failure */
+
+#ifdef _NOTYET_
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int					/* error */
+xfs_inobt_delete(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat); /* success/failure */
+#endif	/* _NOTYET_ */
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int					/* error */
+xfs_inobt_get_rec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		*ino,	/* output: starting inode of chunk */
+	__int32_t		*fcnt,	/* output: number of free inodes */
+	xfs_inofree_t		*free,	/* output: free inode mask */
+	int			*stat,	/* output: success/failure */
+	xfs_arch_t		arch);	/* output: architecture */
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int					/* error */
+xfs_inobt_increment(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level in btree, 0 is leaf */
+	int			*stat); /* success/failure */
+
+/*
+ * Insert the current record at the point referenced by cur.
+ * The cursor may be inconsistent on return if splits have been done.
+ */
+int					/* error */
+xfs_inobt_insert(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the record equal to ino in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_eq(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free,	/* free inode mask */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_ge(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free,	/* free inode mask */
+	int			*stat); /* success/failure */
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int					/* error */
+xfs_inobt_lookup_le(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free,	/* free inode mask */
+	int			*stat); /* success/failure */
+
+/*
+ * Update the record referred to by cur, to the value given
+ * by [ino, fcnt, free].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+int					/* error */
+xfs_inobt_update(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agino_t		ino,	/* starting inode of chunk */
+	__int32_t		fcnt,	/* free inode count */
+	xfs_inofree_t		free);	/* free inode mask */
+
+#endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
new file mode 100644
index 000000000000..0c33be976d25
--- /dev/null
+++ b/fs/xfs/xfs_iget.c
@@ -0,0 +1,1040 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Initialize the inode hash table for the newly mounted file system.
+ *
+ * mp -- this is the mount point structure for the file system being
+ *	 initialized
+ */
+void
+xfs_ihash_init(xfs_mount_t *mp)
+{
+	int	i;
+
+	mp->m_ihsize = XFS_BUCKETS(mp);
+	mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize
+				      * sizeof(xfs_ihash_t), KM_SLEEP);
+	ASSERT(mp->m_ihash != NULL);
+	for (i = 0; i < mp->m_ihsize; i++) {
+		rwlock_init(&(mp->m_ihash[i].ih_lock));
+	}
+}
+
+/*
+ * Free up structures allocated by xfs_ihash_init, at unmount time.
+ */
+void
+xfs_ihash_free(xfs_mount_t *mp)
+{
+	kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t));
+	mp->m_ihash = NULL;
+}
+
+/*
+ * Initialize the inode cluster hash table for the newly mounted file system.
+ *
+ * mp -- this is the mount point structure for the file system being
+ *	 initialized
+ */
+void
+xfs_chash_init(xfs_mount_t *mp)
+{
+	int	i;
+
+	/*
+	 * m_chash size is based on m_ihash
+	 * with a minimum of 37 entries
+	 */
+	mp->m_chsize = (XFS_BUCKETS(mp)) /
+			 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
+	if (mp->m_chsize < 37) {
+		mp->m_chsize = 37;
+	}
+	mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
+						 * sizeof(xfs_chash_t),
+						 KM_SLEEP);
+	ASSERT(mp->m_chash != NULL);
+
+	for (i = 0; i < mp->m_chsize; i++) {
+		spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
+	}
+}
+
+/*
+ * Free up structures allocated by xfs_chash_init, at unmount time.
+ */
+void
+xfs_chash_free(xfs_mount_t *mp)
+{
+	int	i;
+
+	for (i = 0; i < mp->m_chsize; i++) {
+		spinlock_destroy(&mp->m_chash[i].ch_lock);
+	}
+
+	kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t));
+	mp->m_chash = NULL;
+}
+
+
+static inline void
+xfs_iget_vnode_init(
+	xfs_mount_t	*mp,
+	vnode_t		*vp,
+	xfs_inode_t	*ip)
+{
+	vp->v_vfsp  = XFS_MTOVFS(mp);
+	vp->v_type  = IFTOVT(ip->i_d.di_mode);
+
+	/* If we have a real type for an on-disk inode, we can set ops(&unlock)
+	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
+	 */
+	if (vp->v_type != VNON) {
+		linvfs_set_inode_ops(LINVFS_GET_IP(vp));
+	}
+}
+
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the hash table for the file system
+ * represented by the mount point parameter mp.	 Each bucket of
+ * the hash table is guarded by an individual semaphore.
+ *
+ * If the inode is found in the hash table, its corresponding vnode
+ * is obtained with a call to vn_get().	 This call takes care of
+ * coordination with the reclamation of the inode and vnode.  Note
+ * that the vmap structure is filled in while holding the hash lock.
+ * This gives us the state of the inode/vnode when we found it and
+ * is used for coordination in vn_get().
+ *
+ * If it is not in core, read it in from the file system's device and
+ * add the inode into the hash table.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.	 It points
+ *	 to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.	This is
+ *	 simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *	  within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *		 for xfs_ilock() for a list of valid values.
+ * bno -- the block number starting the buffer containing the inode,
+ *	  if known (as by bulkstat), else 0.
+ */
+int
+xfs_iget_core(
+	vnode_t		*vp,
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		lock_flags,
+	xfs_inode_t	**ipp,
+	xfs_daddr_t	bno)
+{
+	xfs_ihash_t	*ih;
+	xfs_inode_t	*ip;
+	xfs_inode_t	*iq;
+	vnode_t		*inode_vp;
+	ulong		version;
+	int		error;
+	/* REFERENCED */
+	int		newnode;
+	xfs_chash_t	*ch;
+	xfs_chashlist_t *chl, *chlnew;
+	SPLDECL(s);
+
+
+	ih = XFS_IHASH(mp, ino);
+
+again:
+	read_lock(&ih->ih_lock);
+
+	for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
+		if (ip->i_ino == ino) {
+
+			inode_vp = XFS_ITOV_NULL(ip);
+
+			if (inode_vp == NULL) {
+				/* If IRECLAIM is set this inode is
+				 * on its way out of the system,
+				 * we need to pause and try again.
+				 */
+				if (ip->i_flags & XFS_IRECLAIM) {
+					read_unlock(&ih->ih_lock);
+					delay(1);
+					XFS_STATS_INC(xfsstats.xs_ig_frecycle);
+
+					goto again;
+				}
+
+				xfs_iget_vnode_init(mp, vp, ip);
+
+				vn_trace_exit(vp, "xfs_iget.alloc",
+					(inst_t *)__return_address);
+
+				bhv_desc_init(&(ip->i_bhv_desc), ip, vp,
+							&xfs_vnodeops);
+				vn_bhv_insert_initial(VN_BHV_HEAD(vp),
+							&(ip->i_bhv_desc));
+
+				XFS_STATS_INC(xfsstats.xs_ig_found);
+
+				read_unlock(&ih->ih_lock);
+				goto finish_inode;
+
+			} else if (vp != inode_vp) {
+				struct inode *inode = LINVFS_GET_IP(inode_vp);
+
+				/* The inode is being torn down, pause and
+				 * try again.
+				 */
+				if (inode->i_state & (I_FREEING | I_CLEAR)) {
+					read_unlock(&ih->ih_lock);
+					delay(1);
+					XFS_STATS_INC(xfsstats.xs_ig_frecycle);
+
+					goto again;
+				}
+/* Chances are the other vnode (the one in the inode) is being torn
+ * down right now, and we landed on top of it. Question is, what do
+ * we do? Unhook the old inode and hook up the new one?
+ */
+				cmn_err(CE_PANIC,
+			"xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
+						inode_vp, vp);
+				BUG();
+			}
+
+			read_unlock(&ih->ih_lock);
+
+			XFS_STATS_INC(xfsstats.xs_ig_found);
+
+			/*
+			 * Make sure the vnode and the inode are hooked up
+			 */
+			xfs_iget_vnode_init(mp, vp, ip);
+
+finish_inode:
+			if (lock_flags != 0) {
+				xfs_ilock(ip, lock_flags);
+			}
+
+			newnode = (ip->i_d.di_mode == 0);
+			if (newnode) {
+				xfs_iocore_inode_reinit(ip);
+			}
+			vn_trace_exit(vp, "xfs_iget.found",
+						(inst_t *)__return_address);
+			goto return_ip;
+		}
+	}
+
+	/*
+	 * Inode cache miss: save the hash chain version stamp and unlock
+	 * the chain, so we don't deadlock in vn_alloc.
+	 */
+	XFS_STATS_INC(xfsstats.xs_ig_missed);
+
+	version = ih->ih_version;
+
+	read_unlock(&ih->ih_lock);
+
+	/*
+	 * Read the disk inode attributes into a new inode structure and get
+	 * a new vnode for it.	Initialize the inode lock so we can idestroy
+	 * it soon if it's a dup.  This should also initialize i_dev, i_ino,
+	 * i_bno, i_mount, and i_index.
+	 */
+	error = xfs_iread(mp, tp, ino, &ip, bno);
+	if (error) {
+		return error;
+	}
+
+	/*
+	 * Vnode provided by vn_initialize.
+	 */
+
+	xfs_iget_vnode_init(mp, vp, ip);
+
+	vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
+
+	if (vp->v_fbhv == NULL) {
+		bhv_desc_init(&(ip->i_bhv_desc), ip, vp, &xfs_vnodeops);
+		vn_bhv_insert_initial(VN_BHV_HEAD(vp), &(ip->i_bhv_desc));
+	}
+
+	xfs_inode_lock_init(ip, vp);
+	xfs_iocore_inode_init(ip);
+
+	if (lock_flags != 0) {
+		xfs_ilock(ip, lock_flags);
+	}
+
+	/*
+	 * Put ip on its hash chain, unless someone else hashed a duplicate
+	 * after we released the hash lock.
+	 */
+	write_lock(&ih->ih_lock);
+
+	if (ih->ih_version != version) {
+		for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) {
+			if (iq->i_ino == ino) {
+				write_unlock(&ih->ih_lock);
+				xfs_idestroy(ip);
+
+				XFS_STATS_INC(xfsstats.xs_ig_dup);
+				goto again;
+			}
+		}
+	}
+
+	/*
+	 * These values _must_ be set before releasing ihlock!
+	 */
+	ip->i_hash = ih;
+	if ((iq = ih->ih_next)) {
+		iq->i_prevp = &ip->i_next;
+	}
+	ip->i_next = iq;
+	ip->i_prevp = &ih->ih_next;
+	ih->ih_next = ip;
+	ip->i_udquot = ip->i_gdquot = NULL;
+	ih->ih_version++;
+
+	write_unlock(&ih->ih_lock);
+
+	/*
+	 * put ip on its cluster's hash chain
+	 */
+	ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL &&
+	       ip->i_cnext == NULL);
+
+	chlnew = NULL;
+	ch = XFS_CHASH(mp, ip->i_blkno);
+ chlredo:
+	s = mutex_spinlock(&ch->ch_lock);
+	for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
+		if (chl->chl_blkno == ip->i_blkno) {
+
+			/* insert this inode into the doubly-linked list
+			 * where chl points */
+			if ((iq = chl->chl_ip)) {
+				ip->i_cprev = iq->i_cprev;
+				iq->i_cprev->i_cnext = ip;
+				iq->i_cprev = ip;
+				ip->i_cnext = iq;
+			} else {
+				ip->i_cnext = ip;
+				ip->i_cprev = ip;
+			}
+			chl->chl_ip = ip;
+			ip->i_chash = chl;
+			break;
+		}
+	}
+
+	/* no hash list found for this block; add a new hash list */
+	if (chl == NULL)  {
+		if (chlnew == NULL) {
+			mutex_spinunlock(&ch->ch_lock, s);
+			ASSERT(xfs_chashlist_zone != NULL);
+			chlnew = (xfs_chashlist_t *)
+					kmem_zone_alloc(xfs_chashlist_zone,
+						KM_SLEEP);
+			ASSERT(chlnew != NULL);
+			goto chlredo;
+		} else {
+			ip->i_cnext = ip;
+			ip->i_cprev = ip;
+			ip->i_chash = chlnew;
+			chlnew->chl_ip = ip;
+			chlnew->chl_blkno = ip->i_blkno;
+			chlnew->chl_next = ch->ch_list;
+			ch->ch_list = chlnew;
+			chlnew = NULL;
+		}
+	} else {
+		if (chlnew != NULL) {
+			kmem_zone_free(xfs_chashlist_zone, chlnew);
+		}
+	}
+
+	mutex_spinunlock(&ch->ch_lock, s);
+
+
+	/*
+	 * Link ip to its mount and thread it on the mount's inode list.
+	 */
+	XFS_MOUNT_ILOCK(mp);
+	if ((iq = mp->m_inodes)) {
+		ASSERT(iq->i_mprev->i_mnext == iq);
+		ip->i_mprev = iq->i_mprev;
+		iq->i_mprev->i_mnext = ip;
+		iq->i_mprev = ip;
+		ip->i_mnext = iq;
+	} else {
+		ip->i_mnext = ip;
+		ip->i_mprev = ip;
+	}
+	mp->m_inodes = ip;
+
+	XFS_MOUNT_IUNLOCK(mp);
+
+	newnode = 1;
+
+ return_ip:
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
+
+	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
+	       ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
+
+	*ipp = ip;
+
+	/* Update the linux inode */
+	error = vn_revalidate(vp, ATTR_COMM|ATTR_LAZY);
+
+	return 0;
+}
+
+
+/*
+ * The 'normal' internal xfs_iget, if needed it will
+ * 'allocate', or 'get', the vnode.
+ */
+int
+xfs_iget(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		lock_flags,
+	xfs_inode_t	**ipp,
+	xfs_daddr_t	bno)
+{
+	struct inode	*inode;
+	vnode_t		*vp = NULL;
+	int		error;
+
+retry:
+	XFS_STATS_INC(xfsstats.xs_ig_attempts);
+
+	if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) {
+		bhv_desc_t	*bdp;
+		xfs_inode_t	*ip;
+		int		newnode;
+
+
+		vp = LINVFS_GET_VP(inode);
+		if (inode->i_state & I_NEW) {
+inode_allocate:
+			XFS_STATS_INC(xfsstats.vn_alloc);
+			vn_initialize(inode);
+			error = xfs_iget_core(vp, mp, tp, ino,
+							lock_flags, ipp, bno);
+			if (error) {
+				make_bad_inode(inode);
+				if (inode->i_state & I_NEW)
+					unlock_new_inode(inode);
+				iput(inode);
+			}
+		} else {
+			/* These are true if the inode is in inactive or
+			 * reclaim. The linux inode is about to go away,
+			 * wait for that path to finish, and try again.
+			 */
+			if (vp->v_flag & (VINACT | VRECLM)) {
+				vn_wait(vp);
+				iput(inode);
+				goto retry;
+			}
+
+			bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
+			if (bdp == NULL)
+				goto inode_allocate;
+			ip = XFS_BHVTOI(bdp);
+			if (lock_flags != 0)
+				xfs_ilock(ip, lock_flags);
+			newnode = (ip->i_d.di_mode == 0);
+			if (newnode)
+				xfs_iocore_inode_reinit(ip);
+			vn_revalidate(vp, ATTR_COMM|ATTR_LAZY);
+			XFS_STATS_INC(xfsstats.xs_ig_found);
+			*ipp = ip;
+			error = 0;
+		}
+	} else
+		error = ENOMEM; /* If we got no inode we are out of memory */
+
+	return error;
+}
+
+
+/*
+ * A 'special' interface to xfs_iget, where the
+ * vnode is already allocated.
+ */
+int
+xfs_vn_iget(
+	vfs_t		*vfsp,
+	struct vnode	*vp,
+	xfs_ino_t	ino)
+{
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	int error;
+
+	error = xfs_iget_core(vp, mp, NULL, ino, 0, &ip, 0);
+
+	return error;
+}
+
+
+/*
+ * Do the setup for the various locks within the incore inode.
+ */
+void
+xfs_inode_lock_init(
+	xfs_inode_t	*ip,
+	vnode_t		*vp)
+{
+	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+		     "xfsino", (long)vp->v_number);
+	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number);
+#ifdef NOTYET
+	mutex_init(&ip->i_range_lock.r_spinlock, MUTEX_SPIN, "xrange");
+#endif /* NOTYET */
+	init_waitqueue_head(&ip->i_ipin_wait);
+	atomic_set(&ip->i_pincount, 0);
+	init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number);
+}
+
+/*
+ * Look for the inode corresponding to the given ino in the hash table.
+ * If it is there and its i_transp pointer matches tp, return it.
+ * Otherwise, return NULL.
+ */
+xfs_inode_t *
+xfs_inode_incore(xfs_mount_t	*mp,
+		 xfs_ino_t	ino,
+		 xfs_trans_t	*tp)
+{
+	xfs_ihash_t	*ih;
+	xfs_inode_t	*ip;
+
+	ih = XFS_IHASH(mp, ino);
+	read_lock(&ih->ih_lock);
+	for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
+		if (ip->i_ino == ino) {
+			/*
+			 * If we find it and tp matches, return it.
+			 * Otherwise break from the loop and return
+			 * NULL.
+			 */
+			if (ip->i_transp == tp) {
+				read_unlock(&ih->ih_lock);
+				return (ip);
+			}
+			break;
+		}
+	}	
+	read_unlock(&ih->ih_lock);
+	return (NULL);
+}
+
+/*
+ * Decrement reference count of an inode structure and unlock it.
+ *
+ * ip -- the inode being released
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *	 to be released.  See the comment on xfs_iunlock() for a list
+ *	 of valid values.
+ */
+void
+xfs_iput(xfs_inode_t	*ip,
+	 uint		lock_flags)
+{
+	vnode_t *vp = XFS_ITOV(ip);
+
+	vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address);
+
+	xfs_iunlock(ip, lock_flags);
+
+	VN_RELE(vp);
+}
+
+/*
+ * Special iput for brand-new inodes that are still locked
+ */
+void
+xfs_iput_new(xfs_inode_t	*ip,
+	     uint		lock_flags)
+{
+	vnode_t		*vp = XFS_ITOV(ip);
+	struct inode	*inode = LINVFS_GET_IP(vp);
+
+	vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address);
+
+	/* We shouldn't get here without this being true, but just in case */
+	if (inode->i_state & I_NEW) {
+		make_bad_inode(inode);
+		unlock_new_inode(inode);
+	}
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+	VN_RELE(vp);
+}
+
+
+/*
+ * This routine embodies the part of the reclaim code that pulls
+ * the inode from the inode hash table and the mount structure's
+ * inode list.
+ * This should only be called from xfs_reclaim().
+ */
+void
+xfs_ireclaim(xfs_inode_t *ip)
+{
+	vnode_t		*vp;
+
+	/*
+	 * Remove from old hash list and mount list.
+	 */
+	XFS_STATS_INC(xfsstats.xs_ig_reclaims);
+
+	xfs_iextract(ip);
+
+	/*
+	 * Here we do a spurious inode lock in order to coordinate with
+	 * xfs_sync().	This is because xfs_sync() references the inodes
+	 * in the mount list without taking references on the corresponding
+	 * vnodes.  We make that OK here by ensuring that we wait until
+	 * the inode is unlocked in xfs_sync() before we go ahead and
+	 * free it.  We get both the regular lock and the io lock because
+	 * the xfs_sync() code may need to drop the regular one but will
+	 * still hold the io lock.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	/*
+	 * Release dquots (and their references) if any. An inode may escape
+	 * xfs_inactive and get here via vn_alloc->vn_reclaim path.
+	 */
+	if (ip->i_udquot || ip->i_gdquot) {
+		xfs_qm_dqdettach_inode(ip);
+	}
+
+	/*
+	 * Pull our behavior descriptor from the vnode chain.
+	 */
+	vp = XFS_ITOV_NULL(ip);
+	if (vp) {
+		vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
+	}
+
+	/*
+	 * Free all memory associated with the inode.
+	 */
+	xfs_idestroy(ip);
+}
+
+/*
+ * This routine removes an about-to-be-destroyed inode from
+ * all of the lists in which it is lcoated with the exception
+ * of the behavior chain.  It is used by xfs_ireclaim and
+ * by cxfs relocation cocde, in which case, we are removing
+ * the xfs_inode but leaving the vnode alone since it has
+ * been transformed into a client vnode.
+ */
+void
+xfs_iextract(
+	xfs_inode_t	*ip)
+{
+	xfs_ihash_t	*ih;
+	xfs_inode_t	*iq;
+	xfs_mount_t	*mp;
+	xfs_chash_t	*ch;
+	xfs_chashlist_t *chl, *chm;
+	SPLDECL(s);
+
+	ih = ip->i_hash;
+	write_lock(&ih->ih_lock);
+	if ((iq = ip->i_next)) {
+		iq->i_prevp = ip->i_prevp;
+	}
+	*ip->i_prevp = iq;
+	write_unlock(&ih->ih_lock);
+
+	/*
+	 * Remove from cluster hash list
+	 *   1) delete the chashlist if this is the last inode on the chashlist
+	 *   2) unchain from list of inodes
+	 *   3) point chashlist->chl_ip to 'chl_next' if to this inode.
+	 */
+	mp = ip->i_mount;
+	ch = XFS_CHASH(mp, ip->i_blkno);
+	s = mutex_spinlock(&ch->ch_lock);
+
+	if (ip->i_cnext == ip) {
+		/* Last inode on chashlist */
+		ASSERT(ip->i_cnext == ip && ip->i_cprev == ip);
+		ASSERT(ip->i_chash != NULL);
+		chm=NULL;
+		for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
+			if (chl->chl_blkno == ip->i_blkno) {
+				if (chm == NULL) {
+					/* first item on the list */
+					ch->ch_list = chl->chl_next;
+				} else {
+					chm->chl_next = chl->chl_next;
+				}
+				kmem_zone_free(xfs_chashlist_zone, chl);
+				break;
+			} else {
+				ASSERT(chl->chl_ip != ip);
+				chm = chl;
+			}
+		}
+		ASSERT_ALWAYS(chl != NULL);
+       } else {
+		/* delete one inode from a non-empty list */
+		iq = ip->i_cnext;
+		iq->i_cprev = ip->i_cprev;
+		ip->i_cprev->i_cnext = iq;
+		if (ip->i_chash->chl_ip == ip) {
+			ip->i_chash->chl_ip = iq;
+		}
+		ip->i_chash = __return_address;
+		ip->i_cprev = __return_address;
+		ip->i_cnext = __return_address;
+	}
+	mutex_spinunlock(&ch->ch_lock, s);
+
+	/*
+	 * Remove from mount's inode list.
+	 */
+	XFS_MOUNT_ILOCK(mp);
+	ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
+	iq = ip->i_mnext;
+	iq->i_mprev = ip->i_mprev;
+	ip->i_mprev->i_mnext = iq;
+
+	/*
+	 * Fix up the head pointer if it points to the inode being deleted.
+	 */
+	if (mp->m_inodes == ip) {
+		if (ip == iq) {
+			mp->m_inodes = NULL;
+		} else {
+			mp->m_inodes = iq;
+		}
+	}
+
+	mp->m_ireclaims++;
+	XFS_MOUNT_IUNLOCK(mp);
+}
+
+/*
+ * This is a wrapper routine around the xfs_ilock() routine
+ * used to centralize some grungy code.	 It is used in places
+ * that wish to lock the inode solely for reading the extents.
+ * The reason these places can't just call xfs_ilock(SHARED)
+ * is that the inode lock also guards to bringing in of the
+ * extents from disk for a file in b-tree format.  If the inode
+ * is in b-tree format, then we need to lock the inode exclusively
+ * until the extents are read in.  Locking it exclusively all
+ * the time would limit our parallelism unnecessarily, though.
+ * What we do instead is check to see if the extents have been
+ * read in yet, and only lock the inode exclusively if they
+ * have not.
+ *
+ * The function returns a value which should be given to the
+ * corresponding xfs_iunlock_map_shared().  This value is
+ * the mode in which the lock was actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+	xfs_inode_t	*ip)
+{
+	uint	lock_mode;
+
+	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+		lock_mode = XFS_ILOCK_EXCL;
+	} else {
+		lock_mode = XFS_ILOCK_SHARED;
+	}
+
+	xfs_ilock(ip, lock_mode);
+
+	return lock_mode;
+}
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+	xfs_inode_t	*ip,
+	unsigned int	lock_mode)
+{
+	xfs_iunlock(ip, lock_mode);
+}
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock.	This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ *	 to be locked.	It can be:
+ *		XFS_IOLOCK_SHARED,
+ *		XFS_IOLOCK_EXCL,
+ *		XFS_ILOCK_SHARED,
+ *		XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(xfs_inode_t	*ip,
+	  uint		lock_flags)
+{
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		mrupdate(&ip->i_iolock);
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		mraccess(&ip->i_iolock);
+	}
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		mrupdate(&ip->i_lock);
+	} else if (lock_flags & XFS_ILOCK_SHARED) {
+		mraccess(&ip->i_lock);
+	}
+#ifdef XFS_ILOCK_TRACE
+	xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)return_address);
+#endif
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep.	It returns 1 if it gets
+ * the requested locks and 0 otherwise.	 If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *	 to be locked.	See the comment for xfs_ilock() for a list
+ *	 of valid values.
+ *
+ */
+int
+xfs_ilock_nowait(xfs_inode_t	*ip,
+		 uint		lock_flags)
+{
+	int	iolocked;
+	int	ilocked;
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0);
+
+	iolocked = 0;
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		iolocked = mrtryupdate(&ip->i_iolock);
+		if (!iolocked) {
+			return 0;
+		}
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		iolocked = mrtryaccess(&ip->i_iolock);
+		if (!iolocked) {
+			return 0;
+		}
+	}
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		ilocked = mrtryupdate(&ip->i_lock);
+		if (!ilocked) {
+			if (iolocked) {
+				mrunlock(&ip->i_iolock);
+			}
+			return 0;
+		}
+	} else if (lock_flags & XFS_ILOCK_SHARED) {
+		ilocked = mrtryaccess(&ip->i_lock);
+		if (!ilocked) {
+			if (iolocked) {
+				mrunlock(&ip->i_iolock);
+			}
+			return 0;
+		}
+	}
+#ifdef XFS_ILOCK_TRACE
+	xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address);
+#endif
+	return 1;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait().	The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *	 to be unlocked.  See the comment for xfs_ilock() for a list
+ *	 of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(xfs_inode_t *ip,
+	    uint	lock_flags)
+{
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0);
+	ASSERT(lock_flags != 0);
+
+	if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
+		ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) ||
+		       (ismrlocked(&ip->i_iolock, MR_ACCESS)));
+		ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) ||
+		       (ismrlocked(&ip->i_iolock, MR_UPDATE)));
+		mrunlock(&ip->i_iolock);
+	}
+
+	if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) {
+		ASSERT(!(lock_flags & XFS_ILOCK_SHARED) ||
+		       (ismrlocked(&ip->i_lock, MR_ACCESS)));
+		ASSERT(!(lock_flags & XFS_ILOCK_EXCL) ||
+		       (ismrlocked(&ip->i_lock, MR_UPDATE)));
+		mrunlock(&ip->i_lock);
+
+		/*
+		 * Let the AIL know that this item has been unlocked in case
+		 * it is in the AIL and anyone is waiting on it.  Don't do
+		 * this if the caller has asked us not to.
+		 */
+		if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) &&
+		     ip->i_itemp != NULL) {
+			xfs_trans_unlocked_item(ip->i_mount,
+						(xfs_log_item_t*)(ip->i_itemp));
+		}
+	}
+#ifdef XFS_ILOCK_TRACE
+	xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
+#endif
+}
+
+/*
+ * give up write locks.	 the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(xfs_inode_t	*ip,
+		 uint		lock_flags)
+{
+	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+		mrdemote(&ip->i_lock);
+	}
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
+		mrdemote(&ip->i_iolock);
+	}
+}
+
+/*
+ * The following three routines simply manage the i_flock
+ * semaphore embedded in the inode.  This semaphore synchronizes
+ * processes attempting to flush the in-core inode back to disk.
+ */
+void
+xfs_iflock(xfs_inode_t *ip)
+{
+	psema(&(ip->i_flock), PINOD|PLTWAIT);
+}
+
+int
+xfs_iflock_nowait(xfs_inode_t *ip)
+{
+	return (cpsema(&(ip->i_flock)));
+}
+
+void
+xfs_ifunlock(xfs_inode_t *ip)
+{
+	ASSERT(valusema(&(ip->i_flock)) <= 0);
+	vsema(&(ip->i_flock));
+}
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
new file mode 100644
index 000000000000..40cbb47e7069
--- /dev/null
+++ b/fs/xfs/xfs_imap.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_IMAP_H__
+#define __XFS_IMAP_H__
+
+/*
+ * This is the structure passed to xfs_imap() to map
+ * an inode number to its on disk location.
+ */
+typedef struct xfs_imap {
+	xfs_daddr_t	im_blkno;	/* starting BB of inode chunk */
+	uint		im_len;		/* length in BBs of inode chunk */
+	xfs_agblock_t	im_agblkno;	/* logical block of inode chunk in ag */
+	ushort		im_ioffset;	/* inode offset in block in "inodes" */
+	ushort		im_boffset;	/* inode offset in block in bytes */
+} xfs_imap_t;
+
+#ifdef __KERNEL__
+struct xfs_mount;
+struct xfs_trans;
+int	xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
+		 xfs_imap_t *, uint);
+#endif
+
+#endif	/* __XFS_IMAP_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
new file mode 100644
index 000000000000..f7c35e754899
--- /dev/null
+++ b/fs/xfs/xfs_inode.c
@@ -0,0 +1,3626 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+kmem_zone_t *xfs_ifork_zone;
+kmem_zone_t *xfs_inode_zone;
+kmem_zone_t *xfs_chashlist_zone;
+
+/*
+ * Used in xfs_itruncate().  This is the maximum number of extents
+ * freed from a file in a single transaction.
+ */
+#define XFS_ITRUNC_MAX_EXTENTS	2
+
+STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
+STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
+STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
+STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
+
+
+#ifdef DEBUG
+/*
+ * Make sure that the extents in the given memory buffer
+ * are valid.
+ */
+STATIC void
+xfs_validate_extents(
+	xfs_bmbt_rec_32_t	*ep,
+	int			nrecs,
+	xfs_exntfmt_t		fmt)
+{
+	xfs_bmbt_irec_t		irec;
+	int			i;
+	xfs_bmbt_rec_t		rec;
+
+	for (i = 0; i < nrecs; i++) {
+		bcopy(ep, &rec, sizeof(rec));
+		xfs_bmbt_get_all(&rec, &irec);
+		if (fmt == XFS_EXTFMT_NOSTATE)
+			ASSERT(irec.br_state == XFS_EXT_NORM);
+		ep++;
+	}
+}
+#else /* DEBUG */
+#define xfs_validate_extents(ep, nrecs, fmt)
+#endif /* DEBUG */
+
+/*
+ * Check that none of the inode's in the buffer have a next
+ * unlinked field of 0.
+ */
+#if defined(DEBUG)
+void
+xfs_inobp_check(
+	xfs_mount_t	*mp,
+	xfs_buf_t	*bp)
+{
+	int		i;
+	int		j;
+	xfs_dinode_t	*dip;
+
+	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+	for (i = 0; i < j; i++) {
+		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					i * mp->m_sb.sb_inodesize);
+		if (INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT))  {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"Detected a bogus zero next_unlinked field in incore inode buffer 0x%p.	 About to pop an ASSERT.",
+				bp);
+			ASSERT(!INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT));
+		}
+	}
+}
+#endif
+
+/*
+ * called from bwrite on xfs inode buffers
+ */
+void
+xfs_inobp_bwcheck(xfs_buf_t *bp)
+{
+	xfs_mount_t	*mp;
+	int		i;
+	int		j;
+	xfs_dinode_t	*dip;
+
+	ASSERT(XFS_BUF_FSPRIVATE3(bp, void *) != NULL);
+
+	mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
+
+
+	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
+
+	for (i = 0; i < j; i++)	 {
+		dip = (xfs_dinode_t *) xfs_buf_offset(bp,
+						i * mp->m_sb.sb_inodesize);
+		if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
+			cmn_err(CE_WARN,
+"Bad magic # 0x%x in XFS inode buffer 0x%Lx, starting blockno %Ld, offset 0x%x",
+				INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
+				(__uint64_t)(__psunsigned_t) bp,
+				(__int64_t) XFS_BUF_ADDR(bp),
+				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+			xfs_fs_cmn_err(CE_WARN, mp,
+				"corrupt, unmount and run xfs_repair");
+		}
+		if (INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT))  {
+			cmn_err(CE_WARN,
+"Bad next_unlinked field (0) in XFS inode buffer 0x%x, starting blockno %Ld, offset 0x%x",
+				(__uint64_t)(__psunsigned_t) bp,
+				(__int64_t) XFS_BUF_ADDR(bp),
+				xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+			xfs_fs_cmn_err(CE_WARN, mp,
+				"corrupt, unmount and run xfs_repair");
+		}
+	}
+
+	return;
+}
+
+/*
+ * This routine is called to map an inode number within a file
+ * system to the buffer containing the on-disk version of the
+ * inode.  It returns a pointer to the buffer containing the
+ * on-disk inode in the bpp parameter, and in the dip parameter
+ * it returns a pointer to the on-disk inode within that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and
+ * dipp are undefined.
+ *
+ * Use xfs_imap() to determine the size and location of the
+ * buffer to read from disk.
+ */
+int
+xfs_inotobp(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	xfs_dinode_t	**dipp,
+	xfs_buf_t	**bpp,
+	int		*offset)
+{
+	int		di_ok;
+	xfs_imap_t	imap;
+	xfs_buf_t	*bp;
+	int		error;
+	xfs_dinode_t	*dip;
+
+	/*
+	 * Call the space managment code to find the location of the
+	 * inode on disk.
+	 */
+	imap.im_blkno = 0;
+	error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
+	if (error != 0) {
+		cmn_err(CE_WARN,
+	"xfs_inotobp: xfs_imap()  returned an "
+	"error %d on %s.  Returning error.", error, mp->m_fsname);
+		return error;
+	}
+
+	/*
+	 * If the inode number maps to a block outside the bounds of the
+	 * file system then return NULL rather than calling read_buf
+	 * and panicing when we get an error from the driver.
+	 */
+	if ((imap.im_blkno + imap.im_len) >
+	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+		cmn_err(CE_WARN,
+	"xfs_inotobp: inode number (%d + %d) maps to a block outside the bounds "
+	"of the file system %s.	 Returning EINVAL.",
+			imap.im_blkno, imap.im_len,mp->m_fsname);
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * Read in the buffer.	If tp is NULL, xfs_trans_read_buf() will
+	 * default to just a read_buf() call.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
+				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
+
+	if (error) {
+		cmn_err(CE_WARN,
+	"xfs_inotobp: xfs_trans_read_buf()  returned an "
+	"error %d on %s.  Returning error.", error, mp->m_fsname);
+		return error;
+	}
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
+	di_ok =
+		INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
+		XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
+	if (XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
+			XFS_RANDOM_ITOBP_INOTOBP)) {
+		xfs_trans_brelse(tp, bp);
+		cmn_err(CE_WARN,
+	"xfs_inotobp: XFS_TEST_ERROR()	returned an "
+	"error on %s.  Returning EFSCORRUPTED.",  mp->m_fsname);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	xfs_inobp_check(mp, bp);
+
+	/*
+	 * Set *dipp to point to the on-disk inode in the buffer.
+	 */
+	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+	*bpp = bp;
+	*offset = imap.im_boffset;
+	return 0;
+}
+
+
+/*
+ * This routine is called to map an inode to the buffer containing
+ * the on-disk version of the inode.  It returns a pointer to the
+ * buffer containing the on-disk inode in the bpp parameter, and in
+ * the dip parameter it returns a pointer to the on-disk inode within
+ * that buffer.
+ *
+ * If a non-zero error is returned, then the contents of bpp and
+ * dipp are undefined.
+ *
+ * If the inode is new and has not yet been initialized, use xfs_imap()
+ * to determine the size and location of the buffer to read from disk.
+ * If the inode has already been mapped to its buffer and read in once,
+ * then use the mapping information stored in the inode rather than
+ * calling xfs_imap().	This allows us to avoid the overhead of looking
+ * at the inode btree for small block file systems (see xfs_dilocate()).
+ * We can tell whether the inode has been mapped in before by comparing
+ * its disk block address to 0.	 Only uninitialized inodes will have
+ * 0 for the disk block address.
+ */
+int
+xfs_itobp(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dinode_t	**dipp,
+	xfs_buf_t	**bpp,
+	xfs_daddr_t	bno)
+{
+	xfs_buf_t	*bp;
+	int		error;
+	xfs_imap_t	imap;
+#ifdef __KERNEL__
+	int		i;
+	int		ni;
+#endif
+
+	if (ip->i_blkno == (xfs_daddr_t)0) {
+		/*
+		 * Call the space management code to find the location of the
+		 * inode on disk.
+		 */
+		imap.im_blkno = bno;
+		error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
+		if (error != 0) {
+			return error;
+		}
+
+		/*
+		 * If the inode number maps to a block outside the bounds
+		 * of the file system then return NULL rather than calling
+		 * read_buf and panicing when we get an error from the
+		 * driver.
+		 */
+		if ((imap.im_blkno + imap.im_len) >
+		    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
+					"(imap.im_blkno (0x%llx) "
+					"+ imap.im_len (0x%llx)) > "
+					" XFS_FSB_TO_BB(mp, "
+					"mp->m_sb.sb_dblocks) (0x%llx)",
+					(unsigned long long) imap.im_blkno,
+					(unsigned long long) imap.im_len,
+					XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+#endif /* DEBUG */
+			return XFS_ERROR(EINVAL);
+		}
+
+		/*
+		 * Fill in the fields in the inode that will be used to
+		 * map the inode to its buffer from now on.
+		 */
+		ip->i_blkno = imap.im_blkno;
+		ip->i_len = imap.im_len;
+		ip->i_boffset = imap.im_boffset;
+	} else {
+		/*
+		 * We've already mapped the inode once, so just use the
+		 * mapping that we saved the first time.
+		 */
+		imap.im_blkno = ip->i_blkno;
+		imap.im_len = ip->i_len;
+		imap.im_boffset = ip->i_boffset;
+	}
+	ASSERT(bno == 0 || bno == imap.im_blkno);
+
+	/*
+	 * Read in the buffer.	If tp is NULL, xfs_trans_read_buf() will
+	 * default to just a read_buf() call.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
+				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
+
+	if (error) {
+#ifdef DEBUG
+		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
+				"xfs_trans_read_buf() returned error %d, "
+				"imap.im_blkno 0x%llx, imap.im_len 0x%llx",
+				error, (unsigned long long) imap.im_blkno,
+				(unsigned long long) imap.im_len);
+#endif /* DEBUG */
+		return error;
+	}
+#ifdef __KERNEL__
+	/*
+	 * Validate the magic number and version of every inode in the buffer
+	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
+	 */
+#ifdef DEBUG
+	ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
+#else
+	ni = 1;
+#endif
+	for (i = 0; i < ni; i++) {
+		int		di_ok;
+		xfs_dinode_t	*dip;
+
+		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					(i << mp->m_sb.sb_inodelog));
+		di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
+			    XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
+		if (XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
+				 XFS_RANDOM_ITOBP_INOTOBP)) {
+#ifdef DEBUG
+			prdev("bad inode magic/vsn daddr 0x%llx #%d (magic=%x)",
+				mp->m_dev, (unsigned long long)imap.im_blkno, i,
+				INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
+#endif
+			xfs_trans_brelse(tp, bp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	}
+#endif	/* __KERNEL__ */
+
+	xfs_inobp_check(mp, bp);
+
+	/*
+	 * Mark the buffer as an inode buffer now that it looks good
+	 */
+	XFS_BUF_SET_VTYPE(bp, B_FS_INO);
+
+	/*
+	 * Set *dipp to point to the on-disk inode in the buffer.
+	 */
+	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+	*bpp = bp;
+	return 0;
+}
+
+/*
+ * Move inode type and inode format specific information from the
+ * on-disk inode to the in-core inode.	For fifos, devs, and sockets
+ * this means set if_rdev to the proper value.	For files, directories,
+ * and symlinks this means to bring in the in-line data or extent
+ * pointers.  For a file in B-tree format, only the root is immediately
+ * brought in-core.  The rest will be in-lined in if_extents when it
+ * is first referenced (see xfs_iread_extents()).
+ */
+STATIC int
+xfs_iformat(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip)
+{
+	xfs_attr_shortform_t	*atp;
+	int			size;
+	int			error;
+	xfs_fsize_t		di_size;
+	ip->i_df.if_ext_max =
+		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	error = 0;
+
+	if (INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) +
+		INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) >
+	    INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu."
+			"  Unmount and run xfs_repair.",
+			(unsigned long long)ip->i_ino,
+			(int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT)
+			    + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)),
+			(unsigned long long)
+			INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT));
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt dinode %Lu, forkoff = 0x%x."
+			"  Unmount and run xfs_repair.",
+			(unsigned long long)ip->i_ino,
+			(int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT)));
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	switch (ip->i_d.di_mode & IFMT) {
+	case IFIFO:
+	case IFCHR:
+	case IFBLK:
+	case IFSOCK:
+		if (INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)
+			return XFS_ERROR(EFSCORRUPTED);
+		ip->i_d.di_size = 0;
+		ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
+		break;
+
+	case IFREG:
+	case IFLNK:
+	case IFDIR:
+		switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) {
+		case XFS_DINODE_FMT_LOCAL:
+			/*
+			 * no local regular files yet
+			 */
+			if ((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & IFMT) == IFREG) {
+				xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+					"corrupt inode "
+					"(local format for regular file) %Lu.  "
+					"Unmount and run xfs_repair.",
+					(unsigned long long) ip->i_ino);
+				return XFS_ERROR(EFSCORRUPTED);
+			}
+
+			di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
+			if (di_size >
+			    XFS_DFORK_DSIZE_ARCH(dip, ip->i_mount, ARCH_CONVERT)) {
+				xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+					"corrupt inode %Lu "
+					"(bad size %Ld for local inode).  "
+					"Unmount and run xfs_repair.",
+					(unsigned long long) ip->i_ino,
+					(long long) di_size);
+				return XFS_ERROR(EFSCORRUPTED);
+			}
+
+			size = (int)di_size;
+			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
+			break;
+		case XFS_DINODE_FMT_EXTENTS:
+			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
+			break;
+		case XFS_DINODE_FMT_BTREE:
+			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
+			break;
+		default:
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+		break;
+
+	default:
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	if (error) {
+		return error;
+	}
+	if (!XFS_DFORK_Q_ARCH(dip, ARCH_CONVERT))
+		return 0;
+	ASSERT(ip->i_afp == NULL);
+	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
+	ip->i_afp->if_ext_max =
+		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) {
+	case XFS_DINODE_FMT_LOCAL:
+		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR_ARCH(dip, ARCH_CONVERT);
+		size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT);
+		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
+		break;
+	case XFS_DINODE_FMT_BTREE:
+		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
+		break;
+	default:
+		error = XFS_ERROR(EFSCORRUPTED);
+		break;
+	}
+	if (error) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+	}
+	return error;
+}
+
+/*
+ * The file is in-lined in the on-disk inode.
+ * If it fits into if_inline_data, then copy
+ * it there, otherwise allocate a buffer for it
+ * and copy the data there.  Either way, set
+ * if_data to point at the data.
+ * If we allocate a buffer for the data, make
+ * sure that its size is a multiple of 4 and
+ * record the real size in i_real_bytes.
+ */
+STATIC int
+xfs_iformat_local(
+	xfs_inode_t	*ip,
+	xfs_dinode_t	*dip,
+	int		whichfork,
+	int		size)
+{
+	xfs_ifork_t	*ifp;
+	int		real_size;
+
+	/*
+	 * If the size is unreasonable, then something
+	 * is wrong and we just bail out rather than crash in
+	 * kmem_alloc() or bcopy() below.
+	 */
+	if (size > XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt inode %Lu "
+			"(bad size %d for local fork, size = %d).  "
+			"Unmount and run xfs_repair.",
+			(unsigned long long) ip->i_ino, size,
+			XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT));
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	real_size = 0;
+	if (size == 0)
+		ifp->if_u1.if_data = NULL;
+	else if (size <= sizeof(ifp->if_u2.if_inline_data))
+		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+	else {
+		real_size = roundup(size, 4);
+		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+	}
+	ifp->if_bytes = size;
+	ifp->if_real_bytes = real_size;
+	if (size)
+		bcopy(XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT), ifp->if_u1.if_data, size);
+	ifp->if_flags &= ~XFS_IFEXTENTS;
+	ifp->if_flags |= XFS_IFINLINE;
+	return 0;
+}
+
+/*
+ * The file consists of a set of extents all
+ * of which fit into the on-disk inode.
+ * If there are few enough extents to fit into
+ * the if_inline_ext, then copy them there.
+ * Otherwise allocate a buffer for them and copy
+ * them into it.  Either way, set if_extents
+ * to point at the extents.
+ */
+STATIC int
+xfs_iformat_extents(
+	xfs_inode_t	*ip,
+	xfs_dinode_t	*dip,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+	int		nex;
+	int		real_size;
+	int		size;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	nex = XFS_DFORK_NEXTENTS_ARCH(dip, whichfork, ARCH_CONVERT);
+	size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+
+	/*
+	 * If the number of extents is unreasonable, then something
+	 * is wrong and we just bail out rather than crash in
+	 * kmem_alloc() or bcopy() below.
+	 */
+	if (size < 0 || size > XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT)) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt inode %Lu ((a)extents = %d).  "
+			"Unmount and run xfs_repair.",
+			(unsigned long long) ip->i_ino, nex);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	real_size = 0;
+	if (nex == 0)
+		ifp->if_u1.if_extents = NULL;
+	else if (nex <= XFS_INLINE_EXTS)
+		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+	else {
+		ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
+		ASSERT(ifp->if_u1.if_extents != NULL);
+		real_size = size;
+	}
+	ifp->if_bytes = size;
+	ifp->if_real_bytes = real_size;
+	if (size) {
+		xfs_validate_extents(
+			(xfs_bmbt_rec_32_t *)XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT),
+			nex, XFS_EXTFMT_INODE(ip));
+		bcopy(XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT), ifp->if_u1.if_extents,
+		      size);
+		xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex,
+			whichfork);
+		if (whichfork != XFS_DATA_FORK ||
+			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
+				if (xfs_check_nostate_extents(
+				    ifp->if_u1.if_extents, nex))
+					return XFS_ERROR(EFSCORRUPTED);
+	}
+	ifp->if_flags |= XFS_IFEXTENTS;
+	return 0;
+}
+
+/*
+ * The file has too many extents to fit into
+ * the inode, so they are in B-tree format.
+ * Allocate a buffer for the root of the B-tree
+ * and copy the root into it.  The i_extents
+ * field will remain NULL until all of the
+ * extents are read in (when they are needed).
+ */
+STATIC int
+xfs_iformat_btree(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip,
+	int			whichfork)
+{
+	xfs_bmdr_block_t	*dfp;
+	xfs_ifork_t		*ifp;
+	/* REFERENCED */
+	int			nrecs;
+	int			size;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT);
+	size = XFS_BMAP_BROOT_SPACE(dfp);
+	nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
+
+	/*
+	 * blow out if -- fork has less extents than can fit in
+	 * fork (fork shouldn't be a btree format), root btree
+	 * block has more records than can fit into the fork,
+	 * or the number of extents is greater than the number of
+	 * blocks.
+	 */
+	if (XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
+	    || XFS_BMDR_SPACE_CALC(nrecs) >
+			XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT)
+	    || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) {
+		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+			"corrupt inode %Lu (btree).  "
+			"Unmount and run xfs_repair.",
+			(unsigned long long) ip->i_ino);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	ifp->if_broot_bytes = size;
+	ifp->if_broot = kmem_alloc(size, KM_SLEEP);
+	ASSERT(ifp->if_broot != NULL);
+	/*
+	 * Copy and convert from the on-disk structure
+	 * to the in-memory structure.
+	 */
+	xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE_ARCH(dip, ip->i_mount, whichfork, ARCH_CONVERT),
+		ifp->if_broot, size);
+	ifp->if_flags &= ~XFS_IFEXTENTS;
+	ifp->if_flags |= XFS_IFBROOT;
+
+	return 0;
+}
+
+/*
+ * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk
+ * and native format
+ *
+ * buf	= on-disk representation
+ * dip	= native representation
+ * dir	= direction - +ve -> disk to native
+ *		      -ve -> native to disk
+ * arch = on-disk architecture
+ */
+
+void
+xfs_xlate_dinode_core(xfs_caddr_t buf, xfs_dinode_core_t *dip,
+    int dir, xfs_arch_t arch)
+{
+    xfs_dinode_core_t	*buf_core;
+    xfs_dinode_core_t	*mem_core;
+
+    ASSERT(dir);
+
+    buf_core=(xfs_dinode_core_t*)buf;
+    mem_core=(xfs_dinode_core_t*)dip;
+
+    if (arch == ARCH_NOCONVERT) {
+	if (dir>0) {
+	    bcopy((xfs_caddr_t)buf_core, (xfs_caddr_t)mem_core, sizeof(xfs_dinode_core_t));
+	} else {
+	    bcopy((xfs_caddr_t)mem_core, (xfs_caddr_t)buf_core, sizeof(xfs_dinode_core_t));
+	}
+	return;
+    }
+
+    INT_XLATE(buf_core->di_magic,	mem_core->di_magic,	   dir, arch);
+    INT_XLATE(buf_core->di_mode,	mem_core->di_mode,	   dir, arch);
+    INT_XLATE(buf_core->di_version,	mem_core->di_version,	   dir, arch);
+    INT_XLATE(buf_core->di_format,	mem_core->di_format,	   dir, arch);
+    INT_XLATE(buf_core->di_onlink,	mem_core->di_onlink,	   dir, arch);
+    INT_XLATE(buf_core->di_uid,		mem_core->di_uid,	   dir, arch);
+    INT_XLATE(buf_core->di_gid,		mem_core->di_gid,	   dir, arch);
+    INT_XLATE(buf_core->di_nlink,	mem_core->di_nlink,	   dir, arch);
+    INT_XLATE(buf_core->di_projid,	mem_core->di_projid,	   dir, arch);
+
+    if (dir>0) {
+	bcopy(buf_core->di_pad, mem_core->di_pad, sizeof(buf_core->di_pad));
+    } else {
+	bcopy(mem_core->di_pad, buf_core->di_pad, sizeof(buf_core->di_pad));
+    }
+
+    INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec,  dir, arch);
+    INT_XLATE(buf_core->di_atime.t_nsec,mem_core->di_atime.t_nsec, dir, arch);
+
+    INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec,  dir, arch);
+    INT_XLATE(buf_core->di_mtime.t_nsec,mem_core->di_mtime.t_nsec, dir, arch);
+
+    INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec,  dir, arch);
+    INT_XLATE(buf_core->di_ctime.t_nsec,mem_core->di_ctime.t_nsec, dir, arch);
+
+    INT_XLATE(buf_core->di_size,	mem_core->di_size,	   dir, arch);
+    INT_XLATE(buf_core->di_nblocks,	mem_core->di_nblocks,	   dir, arch);
+    INT_XLATE(buf_core->di_extsize,	mem_core->di_extsize,	   dir, arch);
+
+    INT_XLATE(buf_core->di_nextents,	mem_core->di_nextents,	   dir, arch);
+    INT_XLATE(buf_core->di_anextents,	mem_core->di_anextents,	   dir, arch);
+    INT_XLATE(buf_core->di_forkoff,	mem_core->di_forkoff,	   dir, arch);
+    INT_XLATE(buf_core->di_aformat,	mem_core->di_aformat,	   dir, arch);
+    INT_XLATE(buf_core->di_dmevmask,	mem_core->di_dmevmask,	   dir, arch);
+    INT_XLATE(buf_core->di_dmstate,	mem_core->di_dmstate,	   dir, arch);
+    INT_XLATE(buf_core->di_flags,	mem_core->di_flags,	   dir, arch);
+    INT_XLATE(buf_core->di_gen,		mem_core->di_gen,	   dir, arch);
+
+}
+
+/*
+ * Given a mount structure and an inode number, return a pointer
+ * to a newly allocated in-core inode coresponding to the given
+ * inode number.
+ *
+ * Initialize the inode's attributes and extent pointers if it
+ * already has them (it will not if the inode has no links).
+ */
+int
+xfs_iread(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	xfs_inode_t	**ipp,
+	xfs_daddr_t	bno)
+{
+	xfs_buf_t	*bp;
+	xfs_dinode_t	*dip;
+	xfs_inode_t	*ip;
+	int		error;
+
+	ASSERT(xfs_inode_zone != NULL);
+
+	ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+
+	/*
+	 * Get pointer's to the on-disk inode and the buffer containing it.
+	 * If the inode number refers to a block outside the file system
+	 * then xfs_itobp() will return NULL.  In this case we should
+	 * return NULL as well.	 Set i_blkno to 0 so that xfs_itobp() will
+	 * know that this is a new incore inode.
+	 */
+	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno);
+
+	if (error != 0) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return error;
+	}
+
+	/*
+	 * Initialize inode's trace buffers.
+	 * Do this before xfs_iformat in case it adds entries.
+	 */
+#ifdef XFS_BMAP_TRACE
+	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_BMBT_TRACE
+	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_RW_TRACE
+	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_STRAT_TRACE
+	ip->i_strat_trace = ktrace_alloc(XFS_STRAT_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
+#endif
+
+	/*
+	 * If we got something that isn't an inode it means someone
+	 * (nfs or dmi) has a stale handle.
+	 */
+	if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		xfs_trans_brelse(tp, bp);
+#ifdef DEBUG
+		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
+				"dip->di_core.di_magic (0x%x) != "
+				"XFS_DINODE_MAGIC (0x%x)",
+				INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
+				XFS_DINODE_MAGIC);
+#endif /* DEBUG */
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * If the on-disk inode is already linked to a directory
+	 * entry, copy all of the inode into the in-core inode.
+	 * xfs_iformat() handles copying in the inode format
+	 * specific information.
+	 * Otherwise, just get the truly permanent information.
+	 */
+	if (!INT_ISZERO(dip->di_core.di_mode, ARCH_CONVERT)) {
+		xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
+		     &(ip->i_d), 1, ARCH_CONVERT);
+		error = xfs_iformat(ip, dip);
+		if (error)  {
+			kmem_zone_free(xfs_inode_zone, ip);
+			xfs_trans_brelse(tp, bp);
+#ifdef DEBUG
+			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
+					"xfs_iformat() returned error %d",
+					error);
+#endif /* DEBUG */
+			return error;
+		}
+	} else {
+		ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT);
+		ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT);
+		ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT);
+		/*
+		 * Make sure to pull in the mode here as well in
+		 * case the inode is released without being used.
+		 * This ensures that xfs_inactive() will see that
+		 * the inode is already free and not try to mess
+		 * with the uninitialized part of it.
+		 */
+		ip->i_d.di_mode = 0;
+		/*
+		 * Initialize the per-fork minima and maxima for a new
+		 * inode here.	xfs_iformat will do it for old inodes.
+		 */
+		ip->i_df.if_ext_max =
+			XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	}
+
+	/*
+	 * The inode format changed when we moved the link count and
+	 * made it 32 bits long.  If this is an old format inode,
+	 * convert it in memory to look like a new one.	 If it gets
+	 * flushed to disk we will convert back before flushing or
+	 * logging it.	We zero out the new projid field and the old link
+	 * count field.	 We'll handle clearing the pad field (the remains
+	 * of the old uuid field) when we actually convert the inode to
+	 * the new format. We don't change the version number so that we
+	 * can distinguish this from a real new format inode.
+	 */
+	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+		ip->i_d.di_nlink = ip->i_d.di_onlink;
+		ip->i_d.di_onlink = 0;
+		ip->i_d.di_projid = 0;
+	}
+
+	ip->i_delayed_blks = 0;
+
+	/*
+	 * Mark the buffer containing the inode as something to keep
+	 * around for a while.	This helps to keep recently accessed
+	 * meta-data in-core longer.
+	 */
+	 XFS_BUF_SET_REF(bp, XFS_INO_REF);
+
+	/*
+	 * Use xfs_trans_brelse() to release the buffer containing the
+	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
+	 * in xfs_itobp() above.  If tp is NULL, this is just a normal
+	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
+	 * will only release the buffer if it is not dirty within the
+	 * transaction.	 It will be OK to release the buffer in this case,
+	 * because inodes on disk are never destroyed and we will be
+	 * locking the new in-core inode before putting it in the hash
+	 * table where other processes can find it.  Thus we don't have
+	 * to worry about the inode being changed just because we released
+	 * the buffer.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*ipp = ip;
+	return 0;
+}
+
+/*
+ * Read in extents from a btree-format inode.
+ * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
+ */
+int
+xfs_iread_extents(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	int		error;
+	xfs_ifork_t	*ifp;
+	size_t		size;
+
+	if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
+		return XFS_ERROR(EFSCORRUPTED);
+	size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/*
+	 * We know that the size is legal (it's checked in iformat_btree)
+	 */
+	ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
+	ASSERT(ifp->if_u1.if_extents != NULL);
+	ifp->if_lastex = NULLEXTNUM;
+	ifp->if_bytes = ifp->if_real_bytes = (int)size;
+	ifp->if_flags |= XFS_IFEXTENTS;
+	error = xfs_bmap_read_extents(tp, ip, whichfork);
+	if (error) {
+		kmem_free(ifp->if_u1.if_extents, size);
+		ifp->if_u1.if_extents = NULL;
+		ifp->if_bytes = ifp->if_real_bytes = 0;
+		ifp->if_flags &= ~XFS_IFEXTENTS;
+		return error;
+	}
+	xfs_validate_extents((xfs_bmbt_rec_32_t *)ifp->if_u1.if_extents,
+		XFS_IFORK_NEXTENTS(ip, whichfork), XFS_EXTFMT_INODE(ip));
+	return 0;
+}
+
+/*
+ * Allocate an inode on disk and return a copy of it's in-core version.
+ * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
+ * appropriately within the inode.  The uid and gid for the inode are
+ * set according to the contents of the given cred structure.
+ *
+ * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
+ * has a free inode available, call xfs_iget()
+ * to obtain the in-core version of the allocated inode.  Finally,
+ * fill in the inode and log its initial contents.  In this case,
+ * ialloc_context would be set to NULL and call_again set to false.
+ *
+ * If xfs_dialloc() does not have an available inode,
+ * it will replenish its supply by doing an allocation. Since we can
+ * only do one allocation within a transaction without deadlocks, we
+ * must commit the current transaction before returning the inode itself.
+ * In this case, therefore, we will set call_again to true and return.
+ * The caller should then commit the current transaction, start a new
+ * transaction, and call xfs_ialloc() again to actually get the inode.
+ *
+ * To ensure that some other process does not grab the inode that
+ * was allocated during the first call to xfs_ialloc(), this routine
+ * also returns the [locked] bp pointing to the head of the freelist
+ * as ialloc_context.  The caller should hold this buffer across
+ * the commit and pass it back into this routine on the second call.
+ */
+int
+xfs_ialloc(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*pip,
+	mode_t		mode,
+	nlink_t		nlink,
+	dev_t		rdev,
+	cred_t		*cr,
+	xfs_prid_t	prid,
+	int		okalloc,
+	xfs_buf_t	**ialloc_context,
+	boolean_t	*call_again,
+	xfs_inode_t	**ipp)
+{
+	xfs_ino_t	ino;
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+	uint		flags;
+	int		error;
+
+	/*
+	 * Call the space management code to pick
+	 * the on-disk inode to be allocated.
+	 */
+	ASSERT(pip != NULL);
+	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
+			    ialloc_context, call_again, &ino);
+	if (error != 0) {
+		return error;
+	}
+	if (*call_again || ino == NULLFSINO) {
+		*ipp = NULL;
+		return 0;
+	}
+	ASSERT(*ialloc_context == NULL);
+
+	/*
+	 * Get the in-core inode with the lock held exclusively.
+	 * This is because we're setting fields here we need
+	 * to prevent others from looking at until we're done.
+	 */
+	error = xfs_trans_iget(tp->t_mountp, tp, ino, XFS_ILOCK_EXCL, &ip);
+	if (error != 0) {
+		return error;
+	}
+	ASSERT(ip != NULL);
+
+	vp = XFS_ITOV(ip);
+	vp->v_type = IFTOVT(mode);
+	ip->i_d.di_mode = (__uint16_t)mode;
+	ip->i_d.di_onlink = 0;
+	ip->i_d.di_nlink = nlink;
+	ASSERT(ip->i_d.di_nlink == nlink);
+	ip->i_d.di_uid = current->fsuid;
+	ip->i_d.di_gid = current->fsgid;
+	ip->i_d.di_projid = prid;
+	bzero(&(ip->i_d.di_pad[0]), sizeof(ip->i_d.di_pad));
+
+	/* now that we have a v_type we can set Linux inode ops (& unlock) */
+	linvfs_set_inode_ops(LINVFS_GET_IP(XFS_ITOV(ip)));
+
+	/*
+	 * If the superblock version is up to where we support new format
+	 * inodes and this is currently an old format inode, then change
+	 * the inode version number now.  This way we only do the conversion
+	 * here rather than here and in the flush/logging code.
+	 */
+	if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
+	    ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+		ip->i_d.di_version = XFS_DINODE_VERSION_2;
+		/*
+		 * We've already zeroed the old link count, the projid field,
+		 * and the pad field.
+		 */
+	}
+
+	/*
+	 * Project ids won't be stored on disk if we are using a version 1 inode.
+	 */
+	if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
+		xfs_bump_ino_vers2(tp, ip);
+
+	if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
+		ip->i_d.di_gid = pip->i_d.di_gid;
+		if ((pip->i_d.di_mode & ISGID) && (mode & IFMT) == IFDIR) {
+			ip->i_d.di_mode |= ISGID;
+		}
+	}
+
+	/*
+	 * If the group ID of the new file does not match the effective group
+	 * ID or one of the supplementary group IDs, the ISGID bit is
+	 * cleared if the "irixsgid" mount option is set.
+	 */
+	if (ip->i_d.di_mode & ISGID) {
+		if (!in_group_p((gid_t)ip->i_d.di_gid)
+		    && (ip->i_mount->m_flags & XFS_MOUNT_IRIXSGID)) {
+			ip->i_d.di_mode &= ~ISGID;
+		}
+	}
+
+	ip->i_d.di_size = 0;
+	ip->i_d.di_nextents = 0;
+	ASSERT(ip->i_d.di_nblocks == 0);
+	xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
+	/*
+	 * di_gen will have been taken care of in xfs_iread.
+	 */
+	ip->i_d.di_extsize = 0;
+	ip->i_d.di_dmevmask = 0;
+	ip->i_d.di_dmstate = 0;
+	ip->i_d.di_flags = 0;
+	flags = XFS_ILOG_CORE;
+	switch (mode & IFMT) {
+	case IFIFO:
+	case IFCHR:
+	case IFBLK:
+	case IFSOCK:
+		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
+		ip->i_df.if_u2.if_rdev = IRIX_MKDEV(MAJOR(rdev), MINOR(rdev));
+		ip->i_df.if_flags = 0;
+		flags |= XFS_ILOG_DEV;
+		break;
+	case IFREG:
+	case IFDIR:
+	case IFLNK:
+		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+		ip->i_df.if_flags = XFS_IFEXTENTS;
+		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
+		ip->i_df.if_u1.if_extents = NULL;
+		break;
+	default:
+		ASSERT(0);
+	}
+	/*
+	 * Attribute fork settings for new inode.
+	 */
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+	ip->i_d.di_anextents = 0;
+
+#if DEBUG
+	{
+		uint	badflags = VNOSWAP |
+			       VISSWAP |
+			       VREPLICABLE |
+			   /*  VNONREPLICABLE | XXX uncomment this */
+			       VDOCMP |
+			       VFRLOCKS;
+
+		/*
+		 * For shared mounts, VNOSWAP is set in xfs_iget
+		 */
+		if (tp->t_mountp->m_cxfstype != XFS_CXFS_NOT)
+			badflags &= ~VNOSWAP;
+
+		ASSERT(!(vp->v_flag & badflags));
+	}
+#endif /* DEBUG */
+
+	/*
+	 * Log the new values stuffed into the inode.
+	 */
+	xfs_trans_log_inode(tp, ip, flags);
+	*ipp = ip;
+	return 0;
+}
+
+/*
+ * Check to make sure that there are no blocks allocated to the
+ * file beyond the size of the file.  We don't check this for
+ * files with fixed size extents or real time extents, but we
+ * at least do it for regular files.
+ */
+#ifdef DEBUG
+void
+xfs_isize_check(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	xfs_fsize_t	isize)
+{
+	xfs_fileoff_t	map_first;
+	int		nimaps;
+	xfs_bmbt_irec_t imaps[2];
+
+	if ((ip->i_d.di_mode & IFMT) != IFREG)
+		return;
+
+	if ( ip->i_d.di_flags & XFS_DIFLAG_REALTIME )
+		return;
+
+	nimaps = 2;
+	map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+	/*
+	 * The filesystem could be shutting down, so bmapi may return
+	 * an error.
+	 */
+	if (xfs_bmapi(NULL, ip, map_first,
+			 (XFS_B_TO_FSB(mp,
+				       (xfs_ufsize_t)XFS_MAX_FILE_OFFSET) -
+			  map_first),
+			 XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps,
+			 NULL))
+	    return;
+	ASSERT(nimaps == 1);
+	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
+}
+#endif	/* DEBUG */
+
+/*
+ * Calculate the last possible buffered byte in a file.	 This must
+ * include data that was buffered beyond the EOF by the write code.
+ * This also needs to deal with overflowing the xfs_fsize_t type
+ * which can happen for sizes near the limit.
+ *
+ * We also need to take into account any blocks beyond the EOF.	 It
+ * may be the case that they were buffered by a write which failed.
+ * In that case the pages will still be in memory, but the inode size
+ * will never have been updated.
+ */
+xfs_fsize_t
+xfs_file_last_byte(
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp;
+	xfs_fsize_t	last_byte;
+	xfs_fileoff_t	last_block;
+	xfs_fileoff_t	size_last_block;
+	int		error;
+
+	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS));
+
+	mp = ip->i_mount;
+	/*
+	 * Only check for blocks beyond the EOF if the extents have
+	 * been read in.  This eliminates the need for the inode lock,
+	 * and it also saves us from looking when it really isn't
+	 * necessary.
+	 */
+	if (ip->i_df.if_flags & XFS_IFEXTENTS) {
+		error = xfs_bmap_last_offset(NULL, ip, &last_block,
+			XFS_DATA_FORK);
+		if (error) {
+			last_block = 0;
+		}
+	} else {
+		last_block = 0;
+	}
+	size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
+	last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
+
+	last_byte = XFS_FSB_TO_B(mp, last_block);
+	if (last_byte < 0) {
+		return XFS_MAX_FILE_OFFSET;
+	}
+	last_byte += (1 << mp->m_writeio_log);
+	if (last_byte < 0) {
+		return XFS_MAX_FILE_OFFSET;
+	}
+	return last_byte;
+}
+
+#if defined(XFS_RW_TRACE)
+STATIC void
+xfs_itrunc_trace(
+	int		tag,
+	xfs_inode_t	*ip,
+	int		flag,
+	xfs_fsize_t	new_size,
+	xfs_off_t	toss_start,
+	xfs_off_t	toss_finish)
+{
+	if (ip->i_rwtrace == NULL) {
+		return;
+	}
+
+	ktrace_enter(ip->i_rwtrace,
+		     (void*)((long)tag),
+		     (void*)ip,
+		     (void*)((ip->i_d.di_size >> 32) & 0xffffffff),
+		     (void*)(ip->i_d.di_size & 0xffffffff),
+		     (void*)((long)flag),
+		     (void*)((new_size >> 32) & 0xffffffff),
+		     (void*)(new_size & 0xffffffff),
+		     (void*)((toss_start >> 32) & 0xffffffff),
+		     (void*)(toss_start & 0xffffffff),
+		     (void*)((toss_finish >> 32) & 0xffffffff),
+		     (void*)(toss_finish & 0xffffffff),
+		     (void*)((long)private.p_cpuid),
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0);
+}
+#else
+#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
+#endif
+
+/*
+ * Start the truncation of the file to new_size.  The new size
+ * must be smaller than the current size.  This routine will
+ * clear the buffer and page caches of file data in the removed
+ * range, and xfs_itruncate_finish() will remove the underlying
+ * disk blocks.
+ *
+ * The inode must have its I/O lock locked EXCLUSIVELY, and it
+ * must NOT have the inode lock held at all.  This is because we're
+ * calling into the buffer/page cache code and we can't hold the
+ * inode lock when we do so.
+ *
+ * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
+ * or XFS_ITRUNC_MAYBE.	 The XFS_ITRUNC_MAYBE value should be used
+ * in the case that the caller is locking things out of order and
+ * may not be able to call xfs_itruncate_finish() with the inode lock
+ * held without dropping the I/O lock.	If the caller must drop the
+ * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
+ * must be called again with all the same restrictions as the initial
+ * call.
+ */
+void
+xfs_itruncate_start(
+	xfs_inode_t	*ip,
+	uint		flags,
+	xfs_fsize_t	new_size)
+{
+	xfs_fsize_t	last_byte;
+	xfs_off_t	toss_start;
+	xfs_mount_t	*mp;
+	vnode_t		*vp;
+
+	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
+	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
+	ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
+	       (flags == XFS_ITRUNC_MAYBE));
+
+	mp = ip->i_mount;
+	vp = XFS_ITOV(ip);
+	/*
+	 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers
+	 * overlapping the region being removed.  We have to use
+	 * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the
+	 * caller may not be able to finish the truncate without
+	 * dropping the inode's I/O lock.  Make sure
+	 * to catch any pages brought in by buffers overlapping
+	 * the EOF by searching out beyond the isize by our
+	 * block size. We round new_size up to a block boundary
+	 * so that we don't toss things on the same block as
+	 * new_size but before it.
+	 *
+	 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to
+	 * call remapf() over the same region if the file is mapped.
+	 * This frees up mapped file references to the pages in the
+	 * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures
+	 * that we get the latest mapped changes flushed out.
+	 */
+	toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+	toss_start = XFS_FSB_TO_B(mp, toss_start);
+	if (toss_start < 0) {
+		/*
+		 * The place to start tossing is beyond our maximum
+		 * file size, so there is no way that the data extended
+		 * out there.
+		 */
+		return;
+	}
+	last_byte = xfs_file_last_byte(ip);
+	xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
+			 last_byte);
+	if (last_byte > toss_start) {
+		if (flags & XFS_ITRUNC_DEFINITE) {
+			VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+		} else {
+			VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
+		}
+	}
+
+#ifdef DEBUG
+	if (new_size == 0) {
+		ASSERT(VN_CACHED(vp) == 0);
+	}
+#endif
+}
+
+/*
+ * Shrink the file to the given new_size.  The new
+ * size must be smaller than the current size.
+ * This will free up the underlying blocks
+ * in the removed range after a call to xfs_itruncate_start()
+ * or xfs_atruncate_start().
+ *
+ * The transaction passed to this routine must have made
+ * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
+ * This routine may commit the given transaction and
+ * start new ones, so make sure everything involved in
+ * the transaction is tidy before calling here.
+ * Some transaction will be returned to the caller to be
+ * committed.  The incoming transaction must already include
+ * the inode, and both inode locks must be held exclusively.
+ * The inode must also be "held" within the transaction.  On
+ * return the inode will be "held" within the returned transaction.
+ * This routine does NOT require any disk space to be reserved
+ * for it within the transaction.
+ *
+ * The fork parameter must be either xfs_attr_fork or xfs_data_fork,
+ * and it indicates the fork which is to be truncated.	For the
+ * attribute fork we only support truncation to size 0.
+ *
+ * We use the sync parameter to indicate whether or not the first
+ * transaction we perform might have to be synchronous.	 For the attr fork,
+ * it needs to be so if the unlink of the inode is not yet known to be
+ * permanent in the log.  This keeps us from freeing and reusing the
+ * blocks of the attribute fork before the unlink of the inode becomes
+ * permanent.
+ *
+ * For the data fork, we normally have to run synchronously if we're
+ * being called out of the inactive path or we're being called
+ * out of the create path where we're truncating an existing file.
+ * Either way, the truncate needs to be sync so blocks don't reappear
+ * in the file with altered data in case of a crash.  wsync filesystems
+ * can run the first case async because anything that shrinks the inode
+ * has to run sync so by the time we're called here from inactive, the
+ * inode size is permanently set to 0.
+ *
+ * Calls from the truncate path always need to be sync unless we're
+ * in a wsync filesystem and the file has already been unlinked.
+ *
+ * The caller is responsible for correctly setting the sync parameter.
+ * It gets too hard for us to guess here which path we're being called
+ * out of just based on inode state.
+ */
+int
+xfs_itruncate_finish(
+	xfs_trans_t	**tp,
+	xfs_inode_t	*ip,
+	xfs_fsize_t	new_size,
+	int		fork,
+	int		sync)
+{
+	xfs_fsblock_t	first_block;
+	xfs_fileoff_t	first_unmap_block;
+	xfs_fileoff_t	last_block;
+	xfs_filblks_t	unmap_len=0;
+	xfs_mount_t	*mp;
+	xfs_trans_t	*ntp;
+	int		done;
+	int		committed;
+	xfs_bmap_free_t free_list;
+	int		error;
+
+	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
+	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
+	ASSERT(*tp != NULL);
+	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+	ASSERT(ip->i_transp == *tp);
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
+
+
+	ntp = *tp;
+	mp = (ntp)->t_mountp;
+	ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
+
+	/*
+	 * We only support truncating the entire attribute fork.
+	 */
+	if (fork == XFS_ATTR_FORK) {
+		new_size = 0LL;
+	}
+	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
+	xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
+	/*
+	 * The first thing we do is set the size to new_size permanently
+	 * on disk.  This way we don't have to worry about anyone ever
+	 * being able to look at the data being freed even in the face
+	 * of a crash.	What we're getting around here is the case where
+	 * we free a block, it is allocated to another file, it is written
+	 * to, and then we crash.  If the new data gets written to the
+	 * file but the log buffers containing the free and reallocation
+	 * don't, then we'd end up with garbage in the blocks being freed.
+	 * As long as we make the new_size permanent before actually
+	 * freeing any blocks it doesn't matter if they get writtten to.
+	 *
+	 * The callers must signal into us whether or not the size
+	 * setting here must be synchronous.  There are a few cases
+	 * where it doesn't have to be synchronous.  Those cases
+	 * occur if the file is unlinked and we know the unlink is
+	 * permanent or if the blocks being truncated are guaranteed
+	 * to be beyond the inode eof (regardless of the link count)
+	 * and the eof value is permanent.  Both of these cases occur
+	 * only on wsync-mounted filesystems.  In those cases, we're
+	 * guaranteed that no user will ever see the data in the blocks
+	 * that are being truncated so the truncate can run async.
+	 * In the free beyond eof case, the file may wind up with
+	 * more blocks allocated to it than it needs if we crash
+	 * and that won't get fixed until the next time the file
+	 * is re-opened and closed but that's ok as that shouldn't
+	 * be too many blocks.
+	 *
+	 * However, we can't just make all wsync xactions run async
+	 * because there's one call out of the create path that needs
+	 * to run sync where it's truncating an existing file to size
+	 * 0 whose size is > 0.
+	 *
+	 * It's probably possible to come up with a test in this
+	 * routine that would correctly distinguish all the above
+	 * cases from the values of the function parameters and the
+	 * inode state but for sanity's sake, I've decided to let the
+	 * layers above just tell us.  It's simpler to correctly figure
+	 * out in the layer above exactly under what conditions we
+	 * can run async and I think it's easier for others read and
+	 * follow the logic in case something has to be changed.
+	 * cscope is your friend -- rcc.
+	 *
+	 * The attribute fork is much simpler.
+	 *
+	 * For the attribute fork we allow the caller to tell us whether
+	 * the unlink of the inode that led to this call is yet permanent
+	 * in the on disk log.	If it is not and we will be freeing extents
+	 * in this inode then we make the first transaction synchronous
+	 * to make sure that the unlink is permanent by the time we free
+	 * the blocks.
+	 */
+	if (fork == XFS_DATA_FORK) {
+		if (ip->i_d.di_nextents > 0) {
+			ip->i_d.di_size = new_size;
+			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+		}
+	} else if (sync) {
+		ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
+		if (ip->i_d.di_anextents > 0)
+			xfs_trans_set_sync(ntp);
+	}
+	ASSERT(fork == XFS_DATA_FORK ||
+		(fork == XFS_ATTR_FORK &&
+			((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
+			 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
+
+	/*
+	 * Since it is possible for space to become allocated beyond
+	 * the end of the file (in a crash where the space is allocated
+	 * but the inode size is not yet updated), simply remove any
+	 * blocks which show up between the new EOF and the maximum
+	 * possible file size.	If the first block to be removed is
+	 * beyond the maximum file size (ie it is the same as last_block),
+	 * then there is nothing to do.
+	 */
+	last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAX_FILE_OFFSET);
+	ASSERT(first_unmap_block <= last_block);
+	done = 0;
+	if (last_block == first_unmap_block) {
+		done = 1;
+	} else {
+		unmap_len = last_block - first_unmap_block + 1;
+	}
+	while (!done) {
+		/*
+		 * Free up up to XFS_ITRUNC_MAX_EXTENTS.  xfs_bunmapi()
+		 * will tell us whether it freed the entire range or
+		 * not.	 If this is a synchronous mount (wsync),
+		 * then we can tell bunmapi to keep all the
+		 * transactions asynchronous since the unlink
+		 * transaction that made this inode inactive has
+		 * already hit the disk.  There's no danger of
+		 * the freed blocks being reused, there being a
+		 * crash, and the reused blocks suddenly reappearing
+		 * in this file with garbage in them once recovery
+		 * runs.
+		 */
+		XFS_BMAP_INIT(&free_list, &first_block);
+		error = xfs_bunmapi(ntp, ip, first_unmap_block,
+				    unmap_len,
+				    XFS_BMAPI_AFLAG(fork) |
+				      (sync ? 0 : XFS_BMAPI_ASYNC),
+				    XFS_ITRUNC_MAX_EXTENTS,
+				    &first_block, &free_list, &done);
+		if (error) {
+			/*
+			 * If the bunmapi call encounters an error,
+			 * return to the caller where the transaction
+			 * can be properly aborted.  We just need to
+			 * make sure we're not holding any resources
+			 * that we were not when we came in.
+			 */
+			xfs_bmap_cancel(&free_list);
+			return error;
+		}
+
+		/*
+		 * Duplicate the transaction that has the permanent
+		 * reservation and commit the old transaction.
+		 */
+		error = xfs_bmap_finish(tp, &free_list, first_block,
+					&committed);
+		ntp = *tp;
+		if (error) {
+			/*
+			 * If the bmap finish call encounters an error,
+			 * return to the caller where the transaction
+			 * can be properly aborted.  We just need to
+			 * make sure we're not holding any resources
+			 * that we were not when we came in.
+			 *
+			 * Aborting from this point might lose some
+			 * blocks in the file system, but oh well.
+			 */
+			xfs_bmap_cancel(&free_list);
+			if (committed) {
+				/*
+				 * If the passed in transaction committed
+				 * in xfs_bmap_finish(), then we want to
+				 * add the inode to this one before returning.
+				 * This keeps things simple for the higher
+				 * level code, because it always knows that
+				 * the inode is locked and held in the
+				 * transaction that returns to it whether
+				 * errors occur or not.	 We don't mark the
+				 * inode dirty so that this transaction can
+				 * be easily aborted if possible.
+				 */
+				xfs_trans_ijoin(ntp, ip,
+					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+				xfs_trans_ihold(ntp, ip);
+			}
+			return error;
+		}
+
+		if (committed) {
+			/*
+			 * The first xact was committed,
+			 * so add the inode to the new one.
+			 * Mark it dirty so it will be logged
+			 * and moved forward in the log as
+			 * part of every commit.
+			 */
+			xfs_trans_ijoin(ntp, ip,
+					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+			xfs_trans_ihold(ntp, ip);
+			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+		}
+		ntp = xfs_trans_dup(ntp);
+		(void) xfs_trans_commit(*tp, 0, NULL);
+		*tp = ntp;
+		error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+					  XFS_TRANS_PERM_LOG_RES,
+					  XFS_ITRUNCATE_LOG_COUNT);
+		/*
+		 * Add the inode being truncated to the next chained
+		 * transaction.
+		 */
+		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+		xfs_trans_ihold(ntp, ip);
+		if (error)
+			return (error);
+	}
+	/*
+	 * Only update the size in the case of the data fork, but
+	 * always re-log the inode so that our permanent transaction
+	 * can keep on rolling it forward in the log.
+	 */
+	if (fork == XFS_DATA_FORK) {
+		xfs_isize_check(mp, ip, new_size);
+		ip->i_d.di_size = new_size;
+	}
+	xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
+	ASSERT((new_size != 0) ||
+	       (fork == XFS_ATTR_FORK) ||
+	       (ip->i_delayed_blks == 0));
+	ASSERT((new_size != 0) ||
+	       (fork == XFS_ATTR_FORK) ||
+	       (ip->i_d.di_nextents == 0));
+	xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
+	return 0;
+}
+
+
+/*
+ * xfs_igrow_start
+ *
+ * Do the first part of growing a file: zero any data in the last
+ * block that is beyond the old EOF.  We need to do this before
+ * the inode is joined to the transaction to modify the i_size.
+ * That way we can drop the inode lock and call into the buffer
+ * cache to get the buffer mapping the EOF.
+ */
+int
+xfs_igrow_start(
+	xfs_inode_t	*ip,
+	xfs_fsize_t	new_size,
+	cred_t		*credp)
+{
+	xfs_fsize_t	isize;
+	int		error;
+
+	ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
+	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
+	ASSERT(new_size > ip->i_d.di_size);
+
+	error = 0;
+	isize = ip->i_d.di_size;
+	/*
+	 * Zero any pages that may have been created by
+	 * xfs_write_file() beyond the end of the file
+	 * and any blocks between the old and new file sizes.
+	 */
+	error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize,
+				new_size, NULL);
+	return error;
+}
+
+/*
+ * xfs_igrow_finish
+ *
+ * This routine is called to extend the size of a file.
+ * The inode must have both the iolock and the ilock locked
+ * for update and it must be a part of the current transaction.
+ * The xfs_igrow_start() function must have been called previously.
+ * If the change_flag is not zero, the inode change timestamp will
+ * be updated.
+ */
+void
+xfs_igrow_finish(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_fsize_t	new_size,
+	int		change_flag)
+{
+	ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
+	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
+	ASSERT(ip->i_transp == tp);
+	ASSERT(new_size > ip->i_d.di_size);
+
+	/*
+	 * Update the file size.  Update the inode change timestamp
+	 * if change_flag set.
+	 */
+	ip->i_d.di_size = new_size;
+	if (change_flag)
+		xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+}
+
+
+/*
+ * This is called when the inode's link count goes to 0.
+ * We place the on-disk inode on a list in the AGI.  It
+ * will be pulled from this list when the inode is freed.
+ */
+int
+xfs_iunlink(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp;
+	xfs_agi_t	*agi;
+	xfs_dinode_t	*dip;
+	xfs_buf_t	*agibp;
+	xfs_buf_t	*ibp;
+	xfs_agnumber_t	agno;
+	xfs_daddr_t	agdaddr;
+	xfs_agino_t	agino;
+	short		bucket_index;
+	int		offset;
+	int		error;
+	int		agi_ok;
+
+	ASSERT(ip->i_d.di_nlink == 0);
+	ASSERT(ip->i_d.di_mode != 0);
+	ASSERT(ip->i_transp == tp);
+
+	mp = tp->t_mountp;
+
+	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+
+	/*
+	 * Get the agi buffer first.  It ensures lock ordering
+	 * on the list.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
+				   1, 0, &agibp);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi = XFS_BUF_TO_AGI(agibp);
+	agi_ok =
+		INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
+		XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
+	if (XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
+			XFS_RANDOM_IUNLINK)) {
+		xfs_trans_brelse(tp, agibp);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	/*
+	 * Get the index into the agi hash table for the
+	 * list this inode will go on.
+	 */
+	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	ASSERT(agino != 0);
+	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	ASSERT(!INT_ISZERO(agi->agi_unlinked[bucket_index], ARCH_CONVERT));
+	ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != agino);
+
+	if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO) {
+		/*
+		 * There is already another inode in the bucket we need
+		 * to add ourselves to.	 Add us at the front of the list.
+		 * Here we put the head pointer into our next pointer,
+		 * and then we fall through to point the head at us.
+		 */
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+		if (error) {
+			return error;
+		}
+		ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO);
+		ASSERT(!INT_ISZERO(dip->di_next_unlinked, ARCH_CONVERT));
+		/* both on-disk, don't endian flip twice */
+		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
+		offset = ip->i_boffset +
+			offsetof(xfs_dinode_t, di_next_unlinked);
+		xfs_trans_inode_buf(tp, ibp);
+		xfs_trans_log_buf(tp, ibp, offset,
+				  (offset + sizeof(xfs_agino_t) - 1));
+		xfs_inobp_check(mp, ibp);
+	}
+
+	/*
+	 * Point the bucket head pointer at the inode being inserted.
+	 */
+	ASSERT(agino != 0);
+	INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, agino);
+	offset = offsetof(xfs_agi_t, agi_unlinked) +
+		(sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_log_buf(tp, agibp, offset,
+			  (offset + sizeof(xfs_agino_t) - 1));
+	return 0;
+}
+
+/*
+ * Pull the on-disk inode from the AGI unlinked list.
+ */
+STATIC int
+xfs_iunlink_remove(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	xfs_ino_t	next_ino;
+	xfs_mount_t	*mp;
+	xfs_agi_t	*agi;
+	xfs_dinode_t	*dip;
+	xfs_buf_t	*agibp;
+	xfs_buf_t	*ibp;
+	xfs_agnumber_t	agno;
+	xfs_daddr_t	agdaddr;
+	xfs_agino_t	agino;
+	xfs_agino_t	next_agino;
+	xfs_buf_t	*last_ibp;
+	xfs_dinode_t	*last_dip;
+	short		bucket_index;
+	int		offset, last_offset;
+	int		error;
+	int		agi_ok;
+
+	/*
+	 * First pull the on-disk inode from the AGI unlinked list.
+	 */
+	mp = tp->t_mountp;
+
+	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+	agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+
+	/*
+	 * Get the agi buffer first.  It ensures lock ordering
+	 * on the list.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
+				   1, 0, &agibp);
+	if (error != 0) {
+		cmn_err(CE_WARN,
+			"xfs_iunlink_remove: xfs_trans_read_buf()  returned an error %d on %s.	Returning error.",
+			error, mp->m_fsname);
+		return error;
+	}
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi = XFS_BUF_TO_AGI(agibp);
+	agi_ok =
+		INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC &&
+		XFS_AGI_GOOD_VERSION(INT_GET(agi->agi_versionnum, ARCH_CONVERT));
+	if (XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
+			XFS_RANDOM_IUNLINK_REMOVE)) {
+		xfs_trans_brelse(tp, agibp);
+		cmn_err(CE_WARN,
+			"xfs_iunlink_remove: XFS_TEST_ERROR()  returned an error on %s.	 Returning EFSCORRUPTED.",
+			 mp->m_fsname);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	/*
+	 * Get the index into the agi hash table for the
+	 * list this inode will go on.
+	 */
+	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+	ASSERT(agino != 0);
+	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+	ASSERT(INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) != NULLAGINO);
+	ASSERT(!INT_ISZERO(agi->agi_unlinked[bucket_index], ARCH_CONVERT));
+
+	if (INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT) == agino) {
+		/*
+		 * We're at the head of the list.  Get the inode's
+		 * on-disk buffer to see if there is anyone after us
+		 * on the list.	 Only modify our next pointer if it
+		 * is not already NULLAGINO.  This saves us the overhead
+		 * of dealing with the buffer when there is no need to
+		 * change it.
+		 */
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+		if (error) {
+			cmn_err(CE_WARN,
+				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
+				error, mp->m_fsname);
+			return error;
+		}
+		next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
+		ASSERT(next_agino != 0);
+		if (next_agino != NULLAGINO) {
+			INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
+			offset = ip->i_boffset +
+				offsetof(xfs_dinode_t, di_next_unlinked);
+			xfs_trans_inode_buf(tp, ibp);
+			xfs_trans_log_buf(tp, ibp, offset,
+					  (offset + sizeof(xfs_agino_t) - 1));
+			xfs_inobp_check(mp, ibp);
+		} else {
+			xfs_trans_brelse(tp, ibp);
+		}
+		/*
+		 * Point the bucket head pointer at the next inode.
+		 */
+		ASSERT(next_agino != 0);
+		ASSERT(next_agino != agino);
+		INT_SET(agi->agi_unlinked[bucket_index], ARCH_CONVERT, next_agino);
+		offset = offsetof(xfs_agi_t, agi_unlinked) +
+			(sizeof(xfs_agino_t) * bucket_index);
+		xfs_trans_log_buf(tp, agibp, offset,
+				  (offset + sizeof(xfs_agino_t) - 1));
+	} else {
+		/*
+		 * We need to search the list for the inode being freed.
+		 */
+		next_agino = INT_GET(agi->agi_unlinked[bucket_index], ARCH_CONVERT);
+		last_ibp = NULL;
+		while (next_agino != agino) {
+			/*
+			 * If the last inode wasn't the one pointing to
+			 * us, then release its buffer since we're not
+			 * going to do anything with it.
+			 */
+			if (last_ibp != NULL) {
+				xfs_trans_brelse(tp, last_ibp);
+			}
+			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+			error = xfs_inotobp(mp, tp, next_ino, &last_dip,
+					    &last_ibp, &last_offset);
+			if (error) {
+				cmn_err(CE_WARN,
+			"xfs_iunlink_remove: xfs_inotobp()  returned an error %d on %s.	 Returning error.",
+					error, mp->m_fsname);
+				return error;
+			}
+			next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT);
+			ASSERT(next_agino != NULLAGINO);
+			ASSERT(next_agino != 0);
+		}
+		/*
+		 * Now last_ibp points to the buffer previous to us on
+		 * the unlinked list.  Pull us from the list.
+		 */
+		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
+		if (error) {
+			cmn_err(CE_WARN,
+				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
+				error, mp->m_fsname);
+			return error;
+		}
+		next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
+		ASSERT(next_agino != 0);
+		ASSERT(next_agino != agino);
+		if (next_agino != NULLAGINO) {
+			INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
+			offset = ip->i_boffset +
+				offsetof(xfs_dinode_t, di_next_unlinked);
+			xfs_trans_inode_buf(tp, ibp);
+			xfs_trans_log_buf(tp, ibp, offset,
+					  (offset + sizeof(xfs_agino_t) - 1));
+			xfs_inobp_check(mp, ibp);
+		} else {
+			xfs_trans_brelse(tp, ibp);
+		}
+		/*
+		 * Point the previous inode on the list to the next inode.
+		 */
+		INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino);
+		ASSERT(next_agino != 0);
+		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
+		xfs_trans_inode_buf(tp, last_ibp);
+		xfs_trans_log_buf(tp, last_ibp, offset,
+				  (offset + sizeof(xfs_agino_t) - 1));
+		xfs_inobp_check(mp, last_ibp);
+	}
+	return 0;
+}
+
+/*
+ * This is called to return an inode to the inode free list.
+ * The inode should already be truncated to 0 length and have
+ * no pages associated with it.	 This routine also assumes that
+ * the inode is already a part of the transaction.
+ *
+ * The on-disk copy of the inode will have been added to the list
+ * of unlinked inodes in the AGI. We need to remove the inode from
+ * that list atomically with respect to freeing it here.
+ */
+int
+xfs_ifree(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	int	error;
+
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+	ASSERT(ip->i_transp == tp);
+	ASSERT(ip->i_d.di_nlink == 0);
+	ASSERT(ip->i_d.di_nextents == 0);
+	ASSERT(ip->i_d.di_anextents == 0);
+	ASSERT((ip->i_d.di_size == 0) ||
+	       ((ip->i_d.di_mode & IFMT) != IFREG));
+	ASSERT(ip->i_d.di_nblocks == 0);
+
+	/*
+	 * Pull the on-disk inode from the AGI unlinked list.
+	 */
+	error = xfs_iunlink_remove(tp, ip);
+	if (error != 0) {
+		return error;
+	}
+
+	error = xfs_difree(tp, ip->i_ino);
+	if (error != 0) {
+		return error;
+	}
+	ip->i_d.di_mode = 0;		/* mark incore inode as free */
+	ip->i_d.di_flags = 0;
+	ip->i_d.di_dmevmask = 0;
+	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
+	ip->i_df.if_ext_max =
+		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
+	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
+
+	/*
+	 * Bump the generation count so no one will be confused
+	 * by reincarnations of this inode.
+	 */
+	ip->i_d.di_gen++;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return 0;
+}
+
+/*
+ * Reallocate the space for if_broot based on the number of records
+ * being added or deleted as indicated in rec_diff.  Move the records
+ * and pointers in if_broot to fit the new size.  When shrinking this
+ * will eliminate holes between the records and pointers created by
+ * the caller.	When growing this will create holes to be filled in
+ * by the caller.
+ *
+ * The caller must not request to add more records than would fit in
+ * the on-disk inode root.  If the if_broot is currently NULL, then
+ * if we adding records one will be allocated.	The caller must also
+ * not request that the number of records go below zero, although
+ * it can go to zero.
+ *
+ * ip -- the inode whose if_broot area is changing
+ * ext_diff -- the change in the number of records, positive or negative,
+ *	 requested for the if_broot array.
+ */
+void
+xfs_iroot_realloc(
+	xfs_inode_t		*ip,
+	int			rec_diff,
+	int			whichfork)
+{
+	int			cur_max;
+	xfs_ifork_t		*ifp;
+	xfs_bmbt_block_t	*new_broot;
+	int			new_max;
+	size_t			new_size;
+	char			*np;
+	char			*op;
+
+	/*
+	 * Handle the degenerate case quietly.
+	 */
+	if (rec_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (rec_diff > 0) {
+		/*
+		 * If there wasn't any memory allocated before, just
+		 * allocate it now and get out.
+		 */
+		if (ifp->if_broot_bytes == 0) {
+			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
+			ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
+								     KM_SLEEP);
+			ifp->if_broot_bytes = (int)new_size;
+			return;
+		}
+
+		/*
+		 * If there is already an existing if_broot, then we need
+		 * to realloc() it and shift the pointers to their new
+		 * location.  The records don't change location because
+		 * they are kept butted up against the btree block header.
+		 */
+		cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+		new_max = cur_max + rec_diff;
+		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+		ifp->if_broot = (xfs_bmbt_block_t *)
+		  kmem_realloc(ifp->if_broot,
+				new_size,
+				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
+				KM_SLEEP);
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
+						      ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
+						      (int)new_size);
+		ifp->if_broot_bytes = (int)new_size;
+		ASSERT(ifp->if_broot_bytes <=
+			XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+		ovbcopy(op, np, cur_max * (uint)sizeof(xfs_dfsbno_t));
+		return;
+	}
+
+	/*
+	 * rec_diff is less than 0.  In this case, we are shrinking the
+	 * if_broot buffer.  It must already exist.  If we go to zero
+	 * records, just get rid of the root and clear the status bit.
+	 */
+	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
+	cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+	new_max = cur_max + rec_diff;
+	ASSERT(new_max >= 0);
+	if (new_max > 0)
+		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
+	else
+		new_size = 0;
+	if (new_size > 0) {
+		new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
+		/*
+		 * First copy over the btree block header.
+		 */
+		bcopy(ifp->if_broot, new_broot, sizeof(xfs_bmbt_block_t));
+	} else {
+		new_broot = NULL;
+		ifp->if_flags &= ~XFS_IFBROOT;
+	}
+
+	/*
+	 * Only copy the records and pointers if there are any.
+	 */
+	if (new_max > 0) {
+		/*
+		 * First copy the records.
+		 */
+		op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
+						     ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
+						     (int)new_size);
+		bcopy(op, np, new_max * (uint)sizeof(xfs_bmbt_rec_t));
+
+		/*
+		 * Then copy the pointers.
+		 */
+		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
+						     ifp->if_broot_bytes);
+		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
+						     (int)new_size);
+		bcopy(op, np, new_max * (uint)sizeof(xfs_dfsbno_t));
+	}
+	kmem_free(ifp->if_broot, ifp->if_broot_bytes);
+	ifp->if_broot = new_broot;
+	ifp->if_broot_bytes = (int)new_size;
+	ASSERT(ifp->if_broot_bytes <=
+		XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
+	return;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_extents
+ * is increased or decreased.  The change in size is indicated by
+ * the number of extents that need to be added or deleted in the
+ * ext_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer.  Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_extents area is changing
+ * ext_diff -- the change in the number of extents, positive or negative,
+ *	 requested for the if_extents array.
+ */
+void
+xfs_iext_realloc(
+	xfs_inode_t	*ip,
+	int		ext_diff,
+	int		whichfork)
+{
+	int		byte_diff;
+	xfs_ifork_t	*ifp;
+	int		new_size;
+	uint		rnew_size;
+
+	if (ext_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t);
+	new_size = (int)ifp->if_bytes + byte_diff;
+	ASSERT(new_size >= 0);
+
+	if (new_size == 0) {
+		if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
+			ASSERT(ifp->if_real_bytes != 0);
+			kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
+		}
+		ifp->if_u1.if_extents = NULL;
+		rnew_size = 0;
+	} else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) {
+		/*
+		 * If the valid extents can fit in if_inline_ext,
+		 * copy them from the malloc'd vector and free it.
+		 */
+		if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
+			/*
+			 * For now, empty files are format EXTENTS,
+			 * so the if_extents pointer is null.
+			 */
+			if (ifp->if_u1.if_extents) {
+				bcopy(ifp->if_u1.if_extents,
+				      ifp->if_u2.if_inline_ext, new_size);
+				kmem_free(ifp->if_u1.if_extents,
+					  ifp->if_real_bytes);
+			}
+			ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
+		}
+		rnew_size = 0;
+	} else {
+		rnew_size = new_size;
+		if ((rnew_size & (rnew_size - 1)) != 0)
+			rnew_size = xfs_iroundup(rnew_size);
+		/*
+		 * Stuck with malloc/realloc.
+		 */
+		if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) {
+			ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
+				kmem_alloc(rnew_size, KM_SLEEP);
+			bcopy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
+			      sizeof(ifp->if_u2.if_inline_ext));
+		} else if (rnew_size != ifp->if_real_bytes) {
+			ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
+			  kmem_realloc(ifp->if_u1.if_extents,
+					rnew_size,
+					ifp->if_real_bytes,
+					KM_NOFS);
+		}
+	}
+	ifp->if_real_bytes = rnew_size;
+	ifp->if_bytes = new_size;
+}
+
+
+/*
+ * This is called when the amount of space needed for if_data
+ * is increased or decreased.  The change in size is indicated by
+ * the number of bytes that need to be added or deleted in the
+ * byte_diff parameter.
+ *
+ * If the amount of space needed has decreased below the size of the
+ * inline buffer, then switch to using the inline buffer.  Otherwise,
+ * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * to what is needed.
+ *
+ * ip -- the inode whose if_data area is changing
+ * byte_diff -- the change in the number of bytes, positive or negative,
+ *	 requested for the if_data array.
+ */
+void
+xfs_idata_realloc(
+	xfs_inode_t	*ip,
+	int		byte_diff,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+	int		new_size;
+	int		real_size;
+
+	if (byte_diff == 0) {
+		return;
+	}
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	new_size = (int)ifp->if_bytes + byte_diff;
+	ASSERT(new_size >= 0);
+
+	if (new_size == 0) {
+		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+		}
+		ifp->if_u1.if_data = NULL;
+		real_size = 0;
+	} else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
+		/*
+		 * If the valid extents/data can fit in if_inline_ext/data,
+		 * copy them from the malloc'd vector and free it.
+		 */
+		if (ifp->if_u1.if_data == NULL) {
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			ASSERT(ifp->if_real_bytes != 0);
+			bcopy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
+			      new_size);
+			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
+		}
+		real_size = 0;
+	} else {
+		/*
+		 * Stuck with malloc/realloc.
+		 * For inline data, the underlying buffer must be
+		 * a multiple of 4 bytes in size so that it can be
+		 * logged and stay on word boundaries.	We enforce
+		 * that here.
+		 */
+		real_size = roundup(new_size, 4);
+		if (ifp->if_u1.if_data == NULL) {
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
+			/*
+			 * Only do the realloc if the underlying size
+			 * is really changing.
+			 */
+			if (ifp->if_real_bytes != real_size) {
+				ifp->if_u1.if_data =
+					kmem_realloc(ifp->if_u1.if_data,
+							real_size,
+							ifp->if_real_bytes,
+							KM_SLEEP);
+			}
+		} else {
+			ASSERT(ifp->if_real_bytes == 0);
+			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
+			bcopy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
+			      ifp->if_bytes);
+		}
+	}
+	ifp->if_real_bytes = real_size;
+	ifp->if_bytes = new_size;
+	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+}
+
+
+
+
+/*
+ * Map inode to disk block and offset.
+ *
+ * mp -- the mount point structure for the current file system
+ * tp -- the current transaction
+ * ino -- the inode number of the inode to be located
+ * imap -- this structure is filled in with the information necessary
+ *	 to retrieve the given inode from disk
+ * flags -- flags to pass to xfs_dilocate indicating whether or not
+ *	 lookups in the inode btree were OK or not
+ */
+int
+xfs_imap(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	xfs_imap_t	*imap,
+	uint		flags)
+{
+	xfs_fsblock_t	fsbno;
+	int		len;
+	int		off;
+	int		error;
+
+	fsbno = imap->im_blkno ?
+		XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
+	error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
+	if (error != 0) {
+		return error;
+	}
+	imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
+	imap->im_len = XFS_FSB_TO_BB(mp, len);
+	imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
+	imap->im_ioffset = (ushort)off;
+	imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
+	return 0;
+}
+
+void
+xfs_idestroy_fork(
+	xfs_inode_t	*ip,
+	int		whichfork)
+{
+	xfs_ifork_t	*ifp;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (ifp->if_broot != NULL) {
+		kmem_free(ifp->if_broot, ifp->if_broot_bytes);
+		ifp->if_broot = NULL;
+	}
+
+	/*
+	 * If the format is local, then we can't have an extents
+	 * array so just look for an inline data array.	 If we're
+	 * not local then we may or may not have an extents list,
+	 * so check and free it up if we do.
+	 */
+	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
+		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
+		    (ifp->if_u1.if_data != NULL)) {
+			ASSERT(ifp->if_real_bytes != 0);
+			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+			ifp->if_u1.if_data = NULL;
+			ifp->if_real_bytes = 0;
+		}
+	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
+		   (ifp->if_u1.if_extents != NULL) &&
+		   (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) {
+		ASSERT(ifp->if_real_bytes != 0);
+		kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
+		ifp->if_u1.if_extents = NULL;
+		ifp->if_real_bytes = 0;
+	}
+	ASSERT(ifp->if_u1.if_extents == NULL ||
+	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
+	ASSERT(ifp->if_real_bytes == 0);
+	if (whichfork == XFS_ATTR_FORK) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
+		ip->i_afp = NULL;
+	}
+}
+
+/*
+ * This is called free all the memory associated with an inode.
+ * It must free the inode itself and any buffers allocated for
+ * if_extents/if_data and if_broot.  It must also free the lock
+ * associated with the inode.
+ */
+void
+xfs_idestroy(
+	xfs_inode_t	*ip)
+{
+
+	switch (ip->i_d.di_mode & IFMT) {
+	case IFREG:
+	case IFDIR:
+	case IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+#ifdef NOTYET
+	if (ip->i_range_lock.r_sleep != NULL) {
+		freesema(ip->i_range_lock.r_sleep);
+		kmem_free(ip->i_range_lock.r_sleep, sizeof(sema_t));
+	}
+#endif /* NOTYET */
+	mrfree(&ip->i_lock);
+	mrfree(&ip->i_iolock);
+#ifdef NOTYET
+	mutex_destroy(&ip->i_range_lock.r_spinlock);
+#endif /* NOTYET */
+	freesema(&ip->i_flock);
+#ifdef XFS_BMAP_TRACE
+	ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BMBT_TRACE
+	ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+	ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_STRAT_TRACE
+	ktrace_free(ip->i_strat_trace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ktrace_free(ip->i_dir_trace);
+#endif
+	if (ip->i_itemp) {
+		/* XXXdpd should be able to assert this but shutdown
+		 * is leaving the AIL behind. */
+		ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) ||
+		       XFS_FORCED_SHUTDOWN(ip->i_mount));
+		xfs_inode_item_destroy(ip);
+	}
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
+
+/*
+ * Increment the pin count of the given buffer.
+ * This value is protected by ipinlock spinlock in the mount structure.
+ */
+void
+xfs_ipin(
+	xfs_inode_t	*ip)
+{
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+
+	atomic_inc(&ip->i_pincount);
+}
+
+/*
+ * Decrement the pin count of the given inode, and wake up
+ * anyone in xfs_iwait_unpin() if the count goes to 0.	The
+ * inode must have been previoulsy pinned with a call to xfs_ipin().
+ */
+void
+xfs_iunpin(
+	xfs_inode_t	*ip)
+{
+	ASSERT(atomic_read(&ip->i_pincount) > 0);
+
+	if (atomic_dec_and_test(&ip->i_pincount)) {
+		wake_up(&ip->i_ipin_wait);
+	}
+}
+
+/*
+ * This is called to wait for the given inode to be unpinned.
+ * It will sleep until this happens.  The caller must have the
+ * inode locked in at least shared mode so that the buffer cannot
+ * be subsequently pinned once someone is waiting for it to be
+ * unpinned.
+ */
+void
+xfs_iunpin_wait(
+	xfs_inode_t	*ip)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_lsn_t	lsn;
+
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
+
+	if (atomic_read(&ip->i_pincount) == 0) {
+		return;
+	}
+
+	iip = ip->i_itemp;
+	if (iip && iip->ili_last_lsn) {
+		lsn = iip->ili_last_lsn;
+	} else {
+		lsn = (xfs_lsn_t)0;
+	}
+
+	/*
+	 * Give the log a push so we don't wait here too long.
+	 */
+	xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
+
+	wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
+}
+
+
+/*
+ * xfs_iextents_copy()
+ *
+ * This is called to copy the REAL extents (as opposed to the delayed
+ * allocation extents) from the inode into the given buffer.  It
+ * returns the number of bytes copied into the buffer.
+ *
+ * If there are no delayed allocation extents, then we can just
+ * bcopy() the extents into the buffer.	 Otherwise, we need to
+ * examine each extent in turn and skip those which are delayed.
+ */
+int
+xfs_iextents_copy(
+	xfs_inode_t		*ip,
+	xfs_bmbt_rec_32_t	*buffer,
+	int			whichfork)
+{
+	int			copied;
+	xfs_bmbt_rec_32_t	*dest_ep;
+	xfs_bmbt_rec_t		*ep;
+#ifdef XFS_BMAP_TRACE
+	static char		fname[] = "xfs_iextents_copy";
+#endif
+	int			i;
+	xfs_ifork_t		*ifp;
+	int			nrecs;
+	xfs_fsblock_t		start_block;
+
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
+	ASSERT(ifp->if_bytes > 0);
+
+	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+	xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork);
+	ASSERT(nrecs > 0);
+	if (nrecs == XFS_IFORK_NEXTENTS(ip, whichfork)) {
+		/*
+		 * There are no delayed allocation extents,
+		 * so just copy everything.
+		 */
+		ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+		ASSERT(ifp->if_bytes ==
+		       (XFS_IFORK_NEXTENTS(ip, whichfork) *
+			(uint)sizeof(xfs_bmbt_rec_t)));
+		bcopy(ifp->if_u1.if_extents, buffer, ifp->if_bytes);
+		xfs_validate_extents(buffer, nrecs, XFS_EXTFMT_INODE(ip));
+		return ifp->if_bytes;
+	}
+
+	ASSERT(whichfork == XFS_DATA_FORK);
+	/*
+	 * There are some delayed allocation extents in the
+	 * inode, so copy the extents one at a time and skip
+	 * the delayed ones.  There must be at least one
+	 * non-delayed extent.
+	 */
+	ASSERT(nrecs > ip->i_d.di_nextents);
+	ep = ifp->if_u1.if_extents;
+	dest_ep = buffer;
+	copied = 0;
+	for (i = 0; i < nrecs; i++) {
+		start_block = xfs_bmbt_get_startblock(ep);
+		if (ISNULLSTARTBLOCK(start_block)) {
+			/*
+			 * It's a delayed allocation extent, so skip it.
+			 */
+			ep++;
+			continue;
+		}
+
+		*dest_ep = *(xfs_bmbt_rec_32_t *)ep;
+		dest_ep++;
+		ep++;
+		copied++;
+	}
+	ASSERT(copied != 0);
+	ASSERT(copied == ip->i_d.di_nextents);
+	ASSERT((copied * (uint)sizeof(xfs_bmbt_rec_t)) <= XFS_IFORK_DSIZE(ip));
+	xfs_validate_extents(buffer, copied, XFS_EXTFMT_INODE(ip));
+
+	return (copied * (uint)sizeof(xfs_bmbt_rec_t));
+}
+
+/*
+ * Each of the following cases stores data into the same region
+ * of the on-disk inode, so only one of them can be valid at
+ * any given time. While it is possible to have conflicting formats
+ * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
+ * in EXTENTS format, this can only happen when the fork has
+ * changed formats after being modified but before being flushed.
+ * In these cases, the format always takes precedence, because the
+ * format indicates the current state of the fork.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_iflush_fork(
+	xfs_inode_t		*ip,
+	xfs_dinode_t		*dip,
+	xfs_inode_log_item_t	*iip,
+	int			whichfork,
+	xfs_buf_t		*bp)
+{
+	char			*cp;
+	xfs_ifork_t		*ifp;
+	xfs_mount_t		*mp;
+#ifdef XFS_TRANS_DEBUG
+	int			first;
+#endif
+	static const short	brootflag[2] =
+		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
+	static const short	dataflag[2] =
+		{ XFS_ILOG_DDATA, XFS_ILOG_ADATA };
+	static const short	extflag[2] =
+		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
+
+	if (iip == NULL)
+		return 0;
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	/*
+	 * This can happen if we gave up in iformat in an error path,
+	 * for the attribute fork.
+	 */
+	if (ifp == NULL) {
+		ASSERT(whichfork == XFS_ATTR_FORK);
+		return 0;
+	}
+	cp = XFS_DFORK_PTR_ARCH(dip, whichfork, ARCH_CONVERT);
+	mp = ip->i_mount;
+	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
+	case XFS_DINODE_FMT_LOCAL:
+		if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
+		    (ifp->if_bytes > 0)) {
+			ASSERT(ifp->if_u1.if_data != NULL);
+			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
+			bcopy(ifp->if_u1.if_data, cp, ifp->if_bytes);
+		}
+		if (whichfork == XFS_DATA_FORK) {
+			if (XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip)) {
+				return XFS_ERROR(EFSCORRUPTED);
+			}
+		}
+		break;
+
+	case XFS_DINODE_FMT_EXTENTS:
+		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
+		       !(iip->ili_format.ilf_fields & extflag[whichfork]));
+		ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0));
+		ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0));
+		if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
+		    (ifp->if_bytes > 0)) {
+			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
+			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_32_t *)cp,
+				whichfork);
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
+		    (ifp->if_broot_bytes > 0)) {
+			ASSERT(ifp->if_broot != NULL);
+			ASSERT(ifp->if_broot_bytes <=
+			       (XFS_IFORK_SIZE(ip, whichfork) +
+				XFS_BROOT_SIZE_ADJ));
+			xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
+				(xfs_bmdr_block_t *)cp,
+				XFS_DFORK_SIZE_ARCH(dip, mp, whichfork, ARCH_CONVERT));
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev);
+		}
+		break;
+
+	case XFS_DINODE_FMT_UUID:
+		if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+			ASSERT(whichfork == XFS_DATA_FORK);
+			bcopy(&ip->i_df.if_u2.if_uuid, &dip->di_u.di_muuid,
+				sizeof(uuid_t));
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * xfs_iflush() will write a modified inode's changes out to the
+ * inode's on disk home.  The caller must have the inode lock held
+ * in at least shared mode and the inode flush semaphore must be
+ * held as well.  The inode lock will still be held upon return from
+ * the call and the caller is free to unlock it.
+ * The inode flush lock will be unlocked when the inode reaches the disk.
+ * The flags indicate how the inode's buffer should be written out.
+ */
+int
+xfs_iflush(
+	xfs_inode_t		*ip,
+	uint			flags)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_buf_t		*bp;
+	xfs_dinode_t		*dip;
+	xfs_mount_t		*mp;
+	int			error;
+	/* REFERENCED */
+	xfs_chash_t		*ch;
+	xfs_inode_t		*iq;
+	int			clcount;	/* count of inodes clustered */
+	int			bufwasdelwri;
+	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
+	SPLDECL(s);
+
+	XFS_STATS_INC(xfsstats.xs_iflush_count);
+
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
+	ASSERT(valusema(&ip->i_flock) <= 0);
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
+
+	iip = ip->i_itemp;
+	mp = ip->i_mount;
+
+	/*
+	 * If the inode isn't dirty, then just release the inode
+	 * flush lock and do nothing.
+	 */
+	if ((ip->i_update_core == 0) &&
+	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+		ASSERT((iip != NULL) ?
+			 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
+		xfs_ifunlock(ip);
+		return 0;
+	}
+
+	/*
+	 * We can't flush the inode until it is unpinned, so
+	 * wait for it.	 We know noone new can pin it, because
+	 * we are holding the inode lock shared and you need
+	 * to hold it exclusively to pin the inode.
+	 */
+	xfs_iunpin_wait(ip);
+
+	/*
+	 * This may have been unpinned because the filesystem is shutting
+	 * down forcibly. If that's the case we must not write this inode
+	 * to disk, because the log record didn't make it to disk!
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		ip->i_update_core = 0;
+		if (iip)
+			iip->ili_format.ilf_fields = 0;
+		xfs_ifunlock(ip);
+		return XFS_ERROR(EIO);
+	}
+
+	/*
+	 * Get the buffer containing the on-disk inode.
+	 */
+	error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
+	if (error != 0) {
+		xfs_ifunlock(ip);
+		return error;
+	}
+
+	/*
+	 * Decide how buffer will be flushed out.  This is done before
+	 * the call to xfs_iflush_int because this field is zeroed by it.
+	 */
+	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
+		/*
+		 * Flush out the inode buffer according to the directions
+		 * of the caller.  In the cases where the caller has given
+		 * us a choice choose the non-delwri case.  This is because
+		 * the inode is in the AIL and we need to get it out soon.
+		 */
+		switch (flags) {
+		case XFS_IFLUSH_SYNC:
+		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
+			flags = 0;
+			break;
+		case XFS_IFLUSH_ASYNC:
+		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
+			flags = INT_ASYNC;
+			break;
+		case XFS_IFLUSH_DELWRI:
+			flags = INT_DELWRI;
+			break;
+		default:
+			ASSERT(0);
+			flags = 0;
+			break;
+		}
+	} else {
+		switch (flags) {
+		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
+		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
+		case XFS_IFLUSH_DELWRI:
+			flags = INT_DELWRI;
+			break;
+		case XFS_IFLUSH_ASYNC:
+			flags = INT_ASYNC;
+			break;
+		case XFS_IFLUSH_SYNC:
+			flags = 0;
+			break;
+		default:
+			ASSERT(0);
+			flags = 0;
+			break;
+		}
+	}
+
+	/*
+	 * First flush out the inode that xfs_iflush was called with.
+	 */
+	error = xfs_iflush_int(ip, bp);
+	if (error) {
+		goto corrupt_out;
+	}
+
+	/*
+	 * inode clustering:
+	 * see if other inodes can be gathered into this write
+	 */
+
+#ifdef DEBUG
+	ip->i_chash->chl_buf = bp;		/* inode clustering debug */
+#endif
+
+	ch = XFS_CHASH(mp, ip->i_blkno);
+	s = mutex_spinlock(&ch->ch_lock);
+
+	clcount = 0;
+	for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {
+		/*
+		 * Do an un-protected check to see if the inode is dirty and
+		 * is a candidate for flushing.	 These checks will be repeated
+		 * later after the appropriate locks are acquired.
+		 */
+		iip = iq->i_itemp;
+		if ((iq->i_update_core == 0) &&
+		    ((iip == NULL) ||
+		     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+		      xfs_ipincount(iq) == 0) {
+			continue;
+		}
+
+		/*
+		 * Try to get locks.  If any are unavailable,
+		 * then this inode cannot be flushed and is skipped.
+		 */
+
+		/* get inode locks (just i_lock) */
+		if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
+			/* get inode flush lock */
+			if (xfs_iflock_nowait(iq)) {
+				/* check if pinned */
+				if (xfs_ipincount(iq) == 0) {
+					/* arriving here means that
+					 * this inode can be flushed.
+					 * first re-check that it's
+					 * dirty
+					 */
+					iip = iq->i_itemp;
+					if ((iq->i_update_core != 0)||
+					    ((iip != NULL) &&
+					     (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+						clcount++;
+						error = xfs_iflush_int(iq, bp);
+						if (error) {
+							xfs_iunlock(iq,
+								    XFS_ILOCK_SHARED);
+							goto cluster_corrupt_out;
+						}
+					} else {
+						xfs_ifunlock(iq);
+					}
+				} else {
+					xfs_ifunlock(iq);
+				}
+			}
+			xfs_iunlock(iq, XFS_ILOCK_SHARED);
+		}
+	}
+	mutex_spinunlock(&ch->ch_lock, s);
+
+	if (clcount) {
+		XFS_STATS_INC(xfsstats.xs_icluster_flushcnt);
+		XFS_STATS_ADD(xfsstats.xs_icluster_flushinode, clcount);
+	}
+
+	/*
+	 * If the buffer is pinned then push on the log so we won't
+	 * get stuck waiting in the write for too long.
+	 */
+	if (XFS_BUF_ISPINNED(bp)){
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	}
+
+	if (flags & INT_DELWRI) {
+		xfs_bdwrite(mp, bp);
+	} else if (flags & INT_ASYNC) {
+		xfs_bawrite(mp, bp);
+	} else {
+		error = xfs_bwrite(mp, bp);
+	}
+	return error;
+
+corrupt_out:
+	xfs_buf_relse(bp);
+	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+	xfs_iflush_abort(ip);
+	/*
+	 * Unlocks the flush lock
+	 */
+	return XFS_ERROR(EFSCORRUPTED);
+
+cluster_corrupt_out:
+	/* Corruption detected in the clustering loop.	Invalidate the
+	 * inode buffer and shut down the filesystem.
+	 */
+	mutex_spinunlock(&ch->ch_lock, s);
+
+	/*
+	 * Clean up the buffer.	 If it was B_DELWRI, just release it --
+	 * brelse can handle it with no problems.  If not, shut down the
+	 * filesystem before releasing the buffer.
+	 */
+	if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
+		xfs_buf_relse(bp);
+	}
+
+	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+
+	if(!bufwasdelwri)  {
+		/*
+		 * Just like incore_relse: if we have b_iodone functions,
+		 * mark the buffer as an error and call them.  Otherwise
+		 * mark it as stale and brelse.
+		 */
+		if (XFS_BUF_IODONE_FUNC(bp)) {
+			XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+			XFS_BUF_UNDONE(bp);
+			XFS_BUF_STALE(bp);
+			XFS_BUF_SHUT(bp);
+			XFS_BUF_ERROR(bp,EIO);
+			xfs_biodone(bp);
+		} else {
+			XFS_BUF_STALE(bp);
+			xfs_buf_relse(bp);
+		}
+	}
+
+	xfs_iflush_abort(iq);
+	/*
+	 * Unlocks the flush lock
+	 */
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
+
+STATIC int
+xfs_iflush_int(
+	xfs_inode_t		*ip,
+	xfs_buf_t		*bp)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_dinode_t		*dip;
+	xfs_mount_t		*mp;
+#ifdef XFS_TRANS_DEBUG
+	int			first;
+#endif
+	SPLDECL(s);
+
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
+	ASSERT(valusema(&ip->i_flock) <= 0);
+	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
+
+	iip = ip->i_itemp;
+	mp = ip->i_mount;
+
+
+	/*
+	 * If the inode isn't dirty, then just release the inode
+	 * flush lock and do nothing.
+	 */
+	if ((ip->i_update_core == 0) &&
+	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+		xfs_ifunlock(ip);
+		return 0;
+	}
+
+	/* set *dip = inode's place in the buffer */
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
+
+	/*
+	 * Clear i_update_core before copying out the data.
+	 * This is for coordination with our timestamp updates
+	 * that don't hold the inode lock. They will always
+	 * update the timestamps BEFORE setting i_update_core,
+	 * so if we clear i_update_core after they set it we
+	 * are guaranteed to see their updates to the timestamps.
+	 * I believe that this depends on strongly ordered memory
+	 * semantics, but we have that.	 We use the SYNCHRONIZE
+	 * macro to make sure that the compiler does not reorder
+	 * the i_update_core access below the data copy below.
+	 */
+	ip->i_update_core = 0;
+	SYNCHRONIZE();
+
+	if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC,
+			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
+		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+		    "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
+			ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip);
+		goto corrupt_out;
+	}
+	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
+				mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
+		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+			"xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
+			ip->i_ino, ip, ip->i_d.di_magic);
+		goto corrupt_out;
+	}
+	if ((ip->i_d.di_mode & IFMT) == IFREG) {
+		if (XFS_TEST_ERROR(
+		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
+		    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
+			xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+				"xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
+				ip->i_ino, ip);
+			goto corrupt_out;
+		}
+	} else if ((ip->i_d.di_mode & IFMT) == IFDIR) {
+		if (XFS_TEST_ERROR(
+		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
+		    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
+		    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
+			xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+				"xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
+				ip->i_ino, ip);
+			goto corrupt_out;
+		}
+	}
+	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
+				ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
+				XFS_RANDOM_IFLUSH_5)) {
+		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+			"xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
+			ip->i_ino,
+			ip->i_d.di_nextents + ip->i_d.di_anextents,
+			ip->i_d.di_nblocks,
+			ip);
+		goto corrupt_out;
+	}
+	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
+				mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
+		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
+			"xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
+			ip->i_ino, ip->i_d.di_forkoff, ip);
+		goto corrupt_out;
+	}
+	/*
+	 * Copy the dirty parts of the inode into the on-disk
+	 * inode.  We always copy out the core of the inode,
+	 * because if the inode is dirty at all the core must
+	 * be.
+	 */
+	xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d),
+		-1, ARCH_CONVERT);
+
+	/*
+	 * If this is really an old format inode and the superblock version
+	 * has not been updated to support only new format inodes, then
+	 * convert back to the old inode format.  If the superblock version
+	 * has been updated, then make the conversion permanent.
+	 */
+	ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
+	       XFS_SB_VERSION_HASNLINK(&mp->m_sb));
+	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+		if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+			/*
+			 * Convert it back.
+			 */
+			ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
+			INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink);
+		} else {
+			/*
+			 * The superblock version has already been bumped,
+			 * so just make the conversion to the new inode
+			 * format permanent.
+			 */
+			ip->i_d.di_version = XFS_DINODE_VERSION_2;
+			INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2);
+			ip->i_d.di_onlink = 0;
+			INT_ZERO(dip->di_core.di_onlink, ARCH_CONVERT);
+			bzero(&(ip->i_d.di_pad[0]), sizeof(ip->i_d.di_pad));
+			bzero(&(dip->di_core.di_pad[0]),
+			      sizeof(dip->di_core.di_pad));
+			ASSERT(ip->i_d.di_projid == 0);
+		}
+	}
+
+	if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
+		goto corrupt_out;
+	}
+
+	if (XFS_IFORK_Q(ip)) {
+		/*
+		 * The only error from xfs_iflush_fork is on the data fork.
+		 */
+		(void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
+	}
+	xfs_inobp_check(mp, bp);
+
+	/*
+	 * We've recorded everything logged in the inode, so we'd
+	 * like to clear the ilf_fields bits so we don't log and
+	 * flush things unnecessarily.	However, we can't stop
+	 * logging all this information until the data we've copied
+	 * into the disk buffer is written to disk.  If we did we might
+	 * overwrite the copy of the inode in the log with all the
+	 * data after re-logging only part of it, and in the face of
+	 * a crash we wouldn't have all the data we need to recover.
+	 *
+	 * What we do is move the bits to the ili_last_fields field.
+	 * When logging the inode, these bits are moved back to the
+	 * ilf_fields field.  In the xfs_iflush_done() routine we
+	 * clear ili_last_fields, since we know that the information
+	 * those bits represent is permanently on disk.	 As long as
+	 * the flush completes before the inode is logged again, then
+	 * both ilf_fields and ili_last_fields will be cleared.
+	 *
+	 * We can play with the ilf_fields bits here, because the inode
+	 * lock must be held exclusively in order to set bits there
+	 * and the flush lock protects the ili_last_fields bits.
+	 * Set ili_logged so the flush done
+	 * routine can tell whether or not to look in the AIL.
+	 * Also, store the current LSN of the inode so that we can tell
+	 * whether the item has moved in the AIL from xfs_iflush_done().
+	 * In order to read the lsn we need the AIL lock, because
+	 * it is a 64 bit value that cannot be read atomically.
+	 */
+	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
+		iip->ili_last_fields = iip->ili_format.ilf_fields;
+		iip->ili_format.ilf_fields = 0;
+		iip->ili_logged = 1;
+
+		ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
+		AIL_LOCK(mp,s);
+		iip->ili_flush_lsn = iip->ili_item.li_lsn;
+		AIL_UNLOCK(mp, s);
+
+		/*
+		 * Attach the function xfs_iflush_done to the inode's
+		 * buffer.  This will remove the inode from the AIL
+		 * and unlock the inode's flush lock when the inode is
+		 * completely written to disk.
+		 */
+		xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
+				      xfs_iflush_done, (xfs_log_item_t *)iip);
+
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+		ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
+	} else {
+		/*
+		 * We're flushing an inode which is not in the AIL and has
+		 * not been logged but has i_update_core set.  For this
+		 * case we can use a B_DELWRI flush and immediately drop
+		 * the inode flush lock because we can avoid the whole
+		 * AIL state thing.  It's OK to drop the flush lock now,
+		 * because we've already locked the buffer and to do anything
+		 * you really need both.
+		 */
+		if (iip != NULL) {
+			ASSERT(iip->ili_logged == 0);
+			ASSERT(iip->ili_last_fields == 0);
+			ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
+		}
+		xfs_ifunlock(ip);
+	}
+
+	return 0;
+
+corrupt_out:
+	return XFS_ERROR(EFSCORRUPTED);
+}
+
+/*
+ * Flush all inactive inodes in mp.  Return true if no user references
+ * were found, false otherwise.
+ */
+int
+xfs_iflush_all(
+	xfs_mount_t	*mp,
+	int		flag)
+{
+	int		busy;
+	int		done;
+	int		purged;
+	xfs_inode_t	*ip;
+	vmap_t		vmap;
+	vnode_t		*vp;
+
+	busy = done = 0;
+	while (!done) {
+		purged = 0;
+		XFS_MOUNT_ILOCK(mp);
+		ip = mp->m_inodes;
+		if (ip == NULL) {
+			break;
+		}
+		do {
+			/* Make sure we skip markers inserted by sync */
+			if (ip->i_mount == NULL) {
+				ip = ip->i_mnext;
+				continue;
+			}
+
+			/*
+			 * It's up to our caller to purge the root
+			 * and quota vnodes later.
+			 */
+			vp = XFS_ITOV_NULL(ip);
+
+			if (!vp) {
+				XFS_MOUNT_IUNLOCK(mp);
+				xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
+				purged = 1;
+				break;
+			}
+
+			if (vn_count(vp) != 0) {
+				if (vn_count(vp) == 1 &&
+				    (ip == mp->m_rootip ||
+				     (mp->m_quotainfo &&
+				      (ip->i_ino == mp->m_sb.sb_uquotino ||
+				       ip->i_ino == mp->m_sb.sb_gquotino)))) {
+
+					ip = ip->i_mnext;
+					continue;
+				}
+				if (!(flag & XFS_FLUSH_ALL)) {
+					ASSERT(0);
+					busy = 1;
+					done = 1;
+					break;
+				}
+				/*
+				 * Ignore busy inodes but continue flushing
+				 * others.
+				 */
+				ip = ip->i_mnext;
+				continue;
+			}
+			/*
+			 * Sample vp mapping while holding mp locked on MP
+			 * systems, so we don't purge a reclaimed or
+			 * nonexistent vnode.  We break from the loop
+			 * since we know that we modify
+			 * it by pulling ourselves from it in xfs_reclaim()
+			 * called via vn_purge() below.	 Set ip to the next
+			 * entry in the list anyway so we'll know below
+			 * whether we reached the end or not.
+			 */
+			VMAP(vp, ip, vmap);
+			vp->v_flag |= VPURGE;		/* OK for vn_purge */
+			XFS_MOUNT_IUNLOCK(mp);
+
+			vn_purge(vp, &vmap);
+
+			purged = 1;
+			break;
+		} while (ip != mp->m_inodes);
+		/*
+		 * We need to distinguish between when we exit the loop
+		 * after a purge and when we simply hit the end of the
+		 * list.  We can't use the (ip == mp->m_inodes) test,
+		 * because when we purge an inode at the start of the list
+		 * the next inode on the list becomes mp->m_inodes.  That
+		 * would cause such a test to bail out early.  The purged
+		 * variable tells us how we got out of the loop.
+		 */
+		if (!purged) {
+			done = 1;
+		}
+	}
+	XFS_MOUNT_IUNLOCK(mp);
+	return !busy;
+}
+
+
+/*
+ * xfs_iaccess: check accessibility of inode for mode.
+ */
+int
+xfs_iaccess(
+	xfs_inode_t	*ip,
+	mode_t		mode,
+	cred_t		*cr)
+{
+	int		error;
+	mode_t		orgmode = mode;
+	struct inode	*inode = LINVFS_GET_IP(XFS_ITOV(ip));
+
+	/*
+	 * Verify that the MAC policy allows the requested access.
+	 */
+	if ((error = _MAC_XFS_IACCESS(ip, mode, cr)))
+		return XFS_ERROR(error);
+
+	if (mode & IWRITE) {
+		umode_t		imode = inode->i_mode;
+
+		if (IS_RDONLY(inode) &&
+		    (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode)))
+			return XFS_ERROR(EROFS);
+	}
+
+	/*
+	 * If there's an Access Control List it's used instead of
+	 * the mode bits.
+	 */
+	if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1)
+		return error ? XFS_ERROR(error) : 0;
+
+	if (current->fsuid != ip->i_d.di_uid) {
+		mode >>= 3;
+		if (!in_group_p((gid_t)ip->i_d.di_gid))
+			mode >>= 3;
+	}
+
+	/*
+	 * If the DACs are ok we don't need any capability check.
+	 */
+	if ((ip->i_d.di_mode & mode) == mode)
+		return 0;
+	/*
+	 * Read/write DACs are always overridable.
+	 * Executable DACs are overridable if at least one exec bit is set.
+	 */
+	if ((orgmode & (IREAD|IWRITE)) || (inode->i_mode & S_IXUGO))
+		if (capable_cred(cr, CAP_DAC_OVERRIDE))
+			return 0;
+
+	if ((orgmode == IREAD) ||
+	    (((ip->i_d.di_mode & IFMT) == IFDIR) &&
+	     (!(orgmode & ~(IWRITE|IEXEC))))) {
+		if (capable_cred(cr, CAP_DAC_READ_SEARCH))
+			return 0;
+#ifdef	NOISE
+		cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode);
+#endif	/* NOISE */
+		return XFS_ERROR(EACCES);
+	}
+	return XFS_ERROR(EACCES);
+}
+
+/*
+ * Return whether or not it is OK to swap to the given file in the
+ * given range.	 Return 0 for OK and otherwise return the error.
+ *
+ * It is only OK to swap to a file if it has no holes, and all
+ * extents have been initialized.
+ *
+ * We use the vnode behavior chain prevent and allow primitives
+ * to ensure that the vnode chain stays coherent while we do this.
+ * This allows us to walk the chain down to the bottom where XFS
+ * lives without worrying about it changing out from under us.
+ */
+int
+xfs_swappable(
+	bhv_desc_t	*bdp)
+{
+	xfs_inode_t	*ip;
+
+	ip = XFS_BHVTOI(bdp);
+	/*
+	 * Verify that the file does not have any
+	 * holes or unwritten exents.
+	 */
+	return xfs_bmap_check_swappable(ip);
+}
+
+/*
+ * xfs_iroundup: round up argument to next power of two
+ */
+uint
+xfs_iroundup(
+	uint	v)
+{
+	int i;
+	uint m;
+
+	if ((v & (v - 1)) == 0)
+		return v;
+	ASSERT((v & 0x80000000) == 0);
+	if ((v & (v + 1)) == 0)
+		return v + 1;
+	for (i = 0, m = 1; i < 31; i++, m <<= 1) {
+		if (v & m)
+			continue;
+		v |= m;
+		if ((v & (v + 1)) == 0)
+			return v + 1;
+	}
+	ASSERT(0);
+	return( 0 );
+}
+
+/*
+ * Change the requested timestamp in the given inode.
+ * We don't lock across timestamp updates, and we don't log them but
+ * we do record the fact that there is dirty information in core.
+ *
+ * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
+ *		with XFS_ICHGTIME_ACC to be sure that access time
+ *		update will take.  Calling first with XFS_ICHGTIME_ACC
+ *		and then XFS_ICHGTIME_MOD may fail to modify the access
+ *		timestamp if the filesystem is mounted noacctm.
+ */
+void
+xfs_ichgtime(xfs_inode_t *ip,
+	     int flags)
+{
+	timespec_t	tv;
+	vnode_t		*vp = XFS_ITOV(ip);
+	struct inode	*inode = LINVFS_GET_IP(vp);
+
+	/*
+	 * We're not supposed to change timestamps in readonly-mounted
+	 * filesystems.	 Throw it away if anyone asks us.
+	 */
+	if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
+		return;
+
+	/*
+	 * Don't update access timestamps on reads if mounted "noatime"
+	 * Throw it away if anyone asks us.
+	 */
+	if (ip->i_mount->m_flags & XFS_MOUNT_NOATIME &&
+	    ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG))
+			== XFS_ICHGTIME_ACC))
+		return;
+
+	nanotime(&tv);
+	if (flags & XFS_ICHGTIME_MOD) {
+		inode->i_mtime = ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
+		ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+	}
+	if (flags & XFS_ICHGTIME_ACC) {
+		inode->i_atime = ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
+		ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
+	}
+	if (flags & XFS_ICHGTIME_CHG) {
+		inode->i_ctime = ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
+		ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
+	}
+
+	/*
+	 * We update the i_update_core field _after_ changing
+	 * the timestamps in order to coordinate properly with
+	 * xfs_iflush() so that we don't lose timestamp updates.
+	 * This keeps us from having to hold the inode lock
+	 * while doing this.  We use the SYNCHRONIZE macro to
+	 * ensure that the compiler does not reorder the update
+	 * of i_update_core above the timestamp updates above.
+	 */
+	SYNCHRONIZE();
+	ip->i_update_core = 1;
+}
+
+/*
+ * xfs_ibusy_check -- Checks whether inode reference count allows unmount
+ *
+ * The value returned is one if the reference count would prevent an unmount.
+ */
+int
+xfs_ibusy_check(
+	xfs_inode_t	*ip,
+	int		refs)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+
+	if ((refs == 1) && (ip == mp->m_rootip))
+		return (0);
+	if ((refs == 1) && (ip == mp->m_rbmip))
+		return (0);
+	if ((refs == 1) && (ip == mp->m_rsumip))
+		return (0);
+	if (mp->m_quotainfo && ip->i_ino == mp->m_sb.sb_uquotino)
+		return (0);
+	if (mp->m_quotainfo && ip->i_ino == mp->m_sb.sb_gquotino)
+		return (0);
+	return (1);
+}
+
+#ifdef XFS_ILOCK_TRACE
+void
+xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
+{
+	ktrace_enter(ip->i_lock_trace,
+		     (void *)ip,
+		      (void *)(__psint_t)lock,		/* 1 = LOCK, 3=UNLOCK, etc */
+		     (void *)(__psint_t)lockflags,	/* XFS_ILOCK_EXCL etc */
+		     (void *)ra,			/* caller of ilock */
+		     (void *)(__psint_t)cpuid(),
+		     (void *)(__psint_t)current_pid(),
+		     0,0,0,0,0,0,0,0,0,0);
+
+}
+#endif /* ILOCK_TRACE */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
new file mode 100644
index 000000000000..a5320cf2cea0
--- /dev/null
+++ b/fs/xfs/xfs_inode.h
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_INODE_H__
+#define __XFS_INODE_H__
+
+/*
+ * File incore extent information, present for each of data & attr forks.
+ */
+#define XFS_INLINE_EXTS 2
+#define XFS_INLINE_DATA 32
+typedef struct xfs_ifork {
+	int			if_bytes;	/* bytes in if_u1 */
+	int			if_real_bytes;	/* bytes allocated in if_u1 */
+	xfs_bmbt_block_t	*if_broot;	/* file's incore btree root */
+	short			if_broot_bytes; /* bytes allocated for root */
+	unsigned char		if_flags;	/* per-fork flags */
+	unsigned char		if_ext_max;	/* max # of extent records */
+	xfs_extnum_t		if_lastex;	/* last if_extents used */
+	union {
+		xfs_bmbt_rec_t	*if_extents;	/* linear map file exts */
+		char		*if_data;	/* inline file data */
+	} if_u1;
+	union {
+		xfs_bmbt_rec_t	if_inline_ext[XFS_INLINE_EXTS];
+						/* very small file extents */
+		char		if_inline_data[XFS_INLINE_DATA];
+						/* very small file data */
+		xfs_dev_t	if_rdev;	/* dev number if special */
+		uuid_t		if_uuid;	/* mount point value */
+	} if_u2;
+} xfs_ifork_t;
+
+/*
+ * Flags for xfs_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD	0x1	/* data fork modification timestamp */
+#define XFS_ICHGTIME_ACC	0x2	/* data fork access timestamp */
+#define XFS_ICHGTIME_CHG	0x4	/* inode field change timestamp */
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define XFS_IFINLINE	0x0001	/* Inline data is read in */
+#define XFS_IFEXTENTS	0x0002	/* All extent pointers are read in */
+#define XFS_IFBROOT	0x0004	/* i_broot points to the bmap b-tree root */
+
+/*
+ * Flags for xfs_imap() and xfs_dilocate().
+ */
+#define XFS_IMAP_LOOKUP		0x1
+
+/*
+ * Maximum number of extent pointers in if_u1.if_extents.
+ */
+#define XFS_MAX_INCORE_EXTENTS	32768
+
+
+#ifdef __KERNEL__
+struct bhv_desc;
+struct cred;
+struct ktrace;
+struct vnode;
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_bmbt_irec;
+struct xfs_bmbt_block;
+struct xfs_inode;
+struct xfs_inode_log_item;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot;
+struct pm;
+
+
+/*
+ * This structure is used to communicate which extents of a file
+ * were holes when a write started from xfs_write_file() to
+ * xfs_strat_read().  This is necessary so that we can know which
+ * blocks need to be zeroed when they are read in in xfs_strat_read()
+ * if they weren\'t allocated when the buffer given to xfs_strat_read()
+ * was mapped.
+ *
+ * We keep a list of these attached to the inode.  The list is
+ * protected by the inode lock and the fact that the io lock is
+ * held exclusively by writers.
+ */
+typedef struct xfs_gap {
+	struct xfs_gap	*xg_next;
+	xfs_fileoff_t	xg_offset_fsb;
+	xfs_extlen_t	xg_count_fsb;
+} xfs_gap_t;
+
+/*
+ * This structure is used to hold common pieces of the buffer
+ * and file for xfs_dio_write and xfs_dio_read.
+ */
+typedef struct xfs_dio {
+	struct xfs_buf	*xd_bp;
+	bhv_desc_t	*xd_bdp;
+	struct xfs_inode *xd_ip;
+	struct xfs_iocore *xd_io;
+	struct cred	*xd_cr;
+	struct pm	*xd_pmp;
+	int		xd_blkalgn;
+	int		xd_ioflag;
+	xfs_off_t	xd_start;
+	size_t		xd_length;
+} xfs_dio_t;
+
+typedef struct dm_attrs_s {
+	__uint32_t	da_dmevmask;	/* DMIG event mask */
+	__uint16_t	da_dmstate;	/* DMIG state info */
+	__uint16_t	da_pad;		/* DMIG extra padding */
+} dm_attrs_t;
+
+typedef struct xfs_iocore {
+	void			*io_obj;	/* pointer to container
+						 * inode or dcxvn structure */
+	struct xfs_mount	*io_mount;	/* fs mount struct ptr */
+#ifdef DEBUG
+	mrlock_t		*io_lock;	/* inode IO lock */
+	mrlock_t		*io_iolock;	/* inode IO lock */
+#endif
+
+	/* I/O state */
+	xfs_fsize_t		io_new_size;	/* sz when write completes */
+
+	/* Miscellaneous state. */
+	unsigned int		io_flags;	/* IO related flags */
+
+	/* DMAPI state */
+	dm_attrs_t		io_dmattrs;
+
+} xfs_iocore_t;
+
+#define	       io_dmevmask     io_dmattrs.da_dmevmask
+#define	       io_dmstate      io_dmattrs.da_dmstate
+
+#define XFS_IO_INODE(io)	((xfs_inode_t *) ((io)->io_obj))
+#define XFS_IO_DCXVN(io)	((dcxvn_t *) ((io)->io_obj))
+
+/*
+ * Flags in the flags field
+ */
+
+#define XFS_IOCORE_ISXFS	0x01
+#define XFS_IOCORE_ISCXFS	0x02
+#define XFS_IOCORE_RT		0x04
+
+#define IO_IS_XFS(io)	((io)->io_flags & XFS_IOCORE_ISXFS)
+
+/*
+ * xfs_iocore prototypes
+ */
+
+extern void xfs_iocore_inode_init(struct xfs_inode *);
+extern void xfs_iocore_inode_reinit(struct xfs_inode *);
+
+
+/*
+ * This is the type used in the xfs inode hash table.
+ * An array of these is allocated for each mounted
+ * file system to hash the inodes for that file system.
+ */
+typedef struct xfs_ihash {
+	struct xfs_inode	*ih_next;	
+	rwlock_t		ih_lock;
+	uint			ih_version;
+} xfs_ihash_t;
+
+/*
+ * Inode hashing and hash bucket locking.
+ */
+#define XFS_BUCKETS(mp) (37*(mp)->m_sb.sb_agcount-1)
+#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)ino) % (mp)->m_ihsize))
+
+/*
+ * This is the xfs inode cluster hash.	This hash is used by xfs_iflush to
+ * find inodes that share a cluster and can be flushed to disk at the same
+ * time.
+ */
+
+typedef struct xfs_chashlist {
+	struct xfs_chashlist	*chl_next;
+	struct xfs_inode	*chl_ip;
+	xfs_daddr_t		chl_blkno;	/* starting block number of
+						 * the cluster */
+#ifdef DEBUG
+	struct xfs_buf		*chl_buf;	/* debug: the inode buffer */
+#endif
+} xfs_chashlist_t;
+
+typedef struct xfs_chash {
+	xfs_chashlist_t		*ch_list;
+	lock_t			ch_lock;
+} xfs_chash_t;
+
+
+/*
+ * This is the xfs in-core inode structure.
+ * Most of the on-disk inode is embedded in the i_d field.
+ *
+ * The extent pointers/inline file space, however, are managed
+ * separately.	The memory for this information is pointed to by
+ * the if_u1 unions depending on the type of the data.
+ * This is used to linearize the array of extents for fast in-core
+ * access.  This is used until the file's number of extents
+ * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
+ * are accessed through the buffer cache.
+ *
+ * Other state kept in the in-core inode is used for identification,
+ * locking, transactional updating, etc of the inode.
+ *
+ * Generally, we do not want to hold the i_rlock while holding the
+ * i_ilock. Hierarchy is i_iolock followed by i_rlock.
+ *
+ * xfs_iptr_t contains all the inode fields upto and including the
+ * i_mnext and i_mprev fields, it is used as a marker in the inode
+ * chain off the mount structure by xfs_sync calls.
+ */
+
+typedef struct {
+	struct xfs_ihash	*ip_hash;	/* pointer to hash header */
+	struct xfs_inode	*ip_next;	/* inode hash link forw */
+	struct xfs_inode	*ip_mnext;	/* next inode in mount list */
+	struct xfs_inode	*ip_mprev;	/* ptr to prev inode */
+	struct xfs_inode	**ip_prevp;	/* ptr to prev i_next */
+	struct xfs_mount	*ip_mount;	/* fs mount struct ptr */
+} xfs_iptr_t;
+
+typedef struct xfs_inode {
+	/* Inode linking and identification information. */
+	struct xfs_ihash	*i_hash;	/* pointer to hash header */
+	struct xfs_inode	*i_next;	/* inode hash link forw */
+	struct xfs_inode	*i_mnext;	/* next inode in mount list */
+	struct xfs_inode	*i_mprev;	/* ptr to prev inode */
+	struct xfs_inode	**i_prevp;	/* ptr to prev i_next */
+	struct xfs_mount	*i_mount;	/* fs mount struct ptr */
+	struct bhv_desc		i_bhv_desc;	/* inode behavior descriptor*/
+	struct xfs_dquot	*i_udquot;	/* user dquot */
+	struct xfs_dquot	*i_gdquot;	/* group dquot */
+
+	/* Inode location stuff */
+	xfs_ino_t		i_ino;		/* inode number (agno/agino)*/
+	xfs_daddr_t		i_blkno;	/* blkno of inode buffer */
+	ushort			i_len;		/* len of inode buffer */
+	ushort			i_boffset;	/* off of inode in buffer */
+
+	/* Extent information. */
+	xfs_ifork_t		*i_afp;		/* attribute fork pointer */
+	xfs_ifork_t		i_df;		/* data fork */
+
+	/* Transaction and locking information. */
+	struct xfs_trans	*i_transp;	/* ptr to owning transaction*/
+	struct xfs_inode_log_item *i_itemp;	/* logging information */
+	mrlock_t		i_lock;		/* inode lock */
+	mrlock_t		i_iolock;	/* inode IO lock */
+	sema_t			i_flock;	/* inode flush lock */
+	atomic_t		i_pincount;	/* inode pin count */
+	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
+	struct xfs_inode	**i_refcache;	/* ptr to entry in ref cache */
+	struct xfs_inode	*i_release;	/* inode to unref */
+
+	/* I/O state */
+	xfs_iocore_t		i_iocore;	/* I/O core */
+
+	/* Miscellaneous state. */
+	unsigned short		i_flags;	/* see defined flags below */
+	unsigned char		i_update_core;	/* timestamps/size is dirty */
+	unsigned char		i_update_size;	/* di_size field is dirty */
+	unsigned int		i_gen;		/* generation count */
+	unsigned int		i_delayed_blks; /* count of delay alloc blks */
+
+	xfs_dinode_core_t	i_d;		/* most of ondisk inode */
+	xfs_chashlist_t		*i_chash;	/* cluster hash list header */
+	struct xfs_inode	*i_cnext;	/* cluster hash link forward */
+	struct xfs_inode	*i_cprev;	/* cluster hash link backward */
+
+#ifdef DEBUG
+	/* Trace buffers per inode. */
+	struct ktrace		*i_xtrace;	/* inode extent list trace */
+	struct ktrace		*i_btrace;	/* inode bmap btree trace */
+	struct ktrace		*i_rwtrace;	/* inode read/write trace */
+	struct ktrace		*i_strat_trace; /* inode strat_write trace */
+	struct ktrace		*i_lock_trace;	/* inode lock/unlock trace */
+	struct ktrace		*i_dir_trace;	/* inode directory trace */
+#endif /* DEBUG */
+} xfs_inode_t;
+
+#endif	/* __KERNEL__ */
+
+
+/*
+ * Fork handling.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_PTR)
+xfs_ifork_t *xfs_ifork_ptr(xfs_inode_t *ip, int w);
+#define XFS_IFORK_PTR(ip,w)		xfs_ifork_ptr(ip,w)
+#else
+#define XFS_IFORK_PTR(ip,w)   ((w) == XFS_DATA_FORK ? &(ip)->i_df : (ip)->i_afp)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_Q)
+int xfs_ifork_q(xfs_inode_t *ip);
+#define XFS_IFORK_Q(ip)			xfs_ifork_q(ip)
+#else
+#define XFS_IFORK_Q(ip)			XFS_CFORK_Q(&(ip)->i_d)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_DSIZE)
+int xfs_ifork_dsize(xfs_inode_t *ip);
+#define XFS_IFORK_DSIZE(ip)		xfs_ifork_dsize(ip)
+#else
+#define XFS_IFORK_DSIZE(ip)		XFS_CFORK_DSIZE(&ip->i_d, ip->i_mount)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_ASIZE)
+int xfs_ifork_asize(xfs_inode_t *ip);
+#define XFS_IFORK_ASIZE(ip)		xfs_ifork_asize(ip)
+#else
+#define XFS_IFORK_ASIZE(ip)		XFS_CFORK_ASIZE(&ip->i_d, ip->i_mount)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_SIZE)
+int xfs_ifork_size(xfs_inode_t *ip, int w);
+#define XFS_IFORK_SIZE(ip,w)		xfs_ifork_size(ip,w)
+#else
+#define XFS_IFORK_SIZE(ip,w)		XFS_CFORK_SIZE(&ip->i_d, ip->i_mount, w)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FORMAT)
+int xfs_ifork_format(xfs_inode_t *ip, int w);
+#define XFS_IFORK_FORMAT(ip,w)		xfs_ifork_format(ip,w)
+#else
+#define XFS_IFORK_FORMAT(ip,w)		XFS_CFORK_FORMAT(&ip->i_d, w)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_FMT_SET)
+void xfs_ifork_fmt_set(xfs_inode_t *ip, int w, int n);
+#define XFS_IFORK_FMT_SET(ip,w,n)	xfs_ifork_fmt_set(ip,w,n)
+#else
+#define XFS_IFORK_FMT_SET(ip,w,n)	XFS_CFORK_FMT_SET(&ip->i_d, w, n)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXTENTS)
+int xfs_ifork_nextents(xfs_inode_t *ip, int w);
+#define XFS_IFORK_NEXTENTS(ip,w)	xfs_ifork_nextents(ip,w)
+#else
+#define XFS_IFORK_NEXTENTS(ip,w)	XFS_CFORK_NEXTENTS(&ip->i_d, w)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_IFORK_NEXT_SET)
+void xfs_ifork_next_set(xfs_inode_t *ip, int w, int n);
+#define XFS_IFORK_NEXT_SET(ip,w,n)	xfs_ifork_next_set(ip,w,n)
+#else
+#define XFS_IFORK_NEXT_SET(ip,w,n)	XFS_CFORK_NEXT_SET(&ip->i_d, w, n)
+#endif
+
+
+#ifdef __KERNEL__
+
+/*
+ * In-core inode flags.
+ */
+#define XFS_IGRIO	0x0001	/* inode used for guaranteed rate i/o */
+#define XFS_IUIOSZ	0x0002	/* inode i/o sizes have been explicitly set */
+#define XFS_IQUIESCE	0x0004	/* we have started quiescing for this inode */
+#define XFS_IRECLAIM	0x0008	/* we have started reclaiming this inode    */
+
+/*
+ * Flags for inode locking.
+ */
+#define XFS_IOLOCK_EXCL		0x001
+#define XFS_IOLOCK_SHARED	0x002
+#define XFS_ILOCK_EXCL		0x004
+#define XFS_ILOCK_SHARED	0x008
+#define XFS_IUNLOCK_NONOTIFY	0x010
+#define XFS_EXTENT_TOKEN_RD	0x040
+#define XFS_SIZE_TOKEN_RD	0x080
+#define XFS_EXTSIZE_RD		(XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD)
+#define XFS_WILLLEND		0x100	/* Always acquire tokens for lending */
+#define XFS_EXTENT_TOKEN_WR	(XFS_EXTENT_TOKEN_RD | XFS_WILLLEND)
+#define XFS_SIZE_TOKEN_WR	(XFS_SIZE_TOKEN_RD | XFS_WILLLEND)
+#define XFS_EXTSIZE_WR		(XFS_EXTSIZE_RD | XFS_WILLLEND)
+
+
+#define XFS_LOCK_MASK	\
+	(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \
+	 XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \
+	 XFS_WILLLEND)
+
+/*
+ * Flags for xfs_iflush()
+ */
+#define XFS_IFLUSH_DELWRI_ELSE_SYNC	1
+#define XFS_IFLUSH_DELWRI_ELSE_ASYNC	2
+#define XFS_IFLUSH_SYNC			3
+#define XFS_IFLUSH_ASYNC		4
+#define XFS_IFLUSH_DELWRI		5
+
+/*
+ * Flags for xfs_iflush_all.
+ */
+#define XFS_FLUSH_ALL		0x1
+
+/*
+ * Flags for xfs_itruncate_start().
+ */
+#define XFS_ITRUNC_DEFINITE	0x1
+#define XFS_ITRUNC_MAYBE	0x2
+
+/*
+ * max file offset is 2^(31+PAGE_SHIFT) - 1 (due to linux page cache)
+ *
+ * NOTE: XFS itself can handle 2^63 - 1 (largest positive value of xfs_fsize_t)
+ * but Linux can't go above 2^(31+PAGE_SHIFT)-1: the Linux VM uses a 32 bit
+ * signed variable to index cache data, so 2^31 * PAGE_SIZE is as big as
+ * you can go.
+ */
+#define XFS_MAX_FILE_OFFSET	((long long)((1ULL<<(31+PAGE_SHIFT))-1ULL))
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOV)
+struct vnode *xfs_itov(xfs_inode_t *ip);
+#define XFS_ITOV(ip)		xfs_itov(ip)
+#else
+#define XFS_ITOV(ip)		BHV_TO_VNODE(XFS_ITOBHV(ip))
+#endif
+#define XFS_ITOV_NULL(ip)	BHV_TO_VNODE_NULL(XFS_ITOBHV(ip))
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ITOBHV)
+struct bhv_desc *xfs_itobhv(xfs_inode_t *ip);
+#define XFS_ITOBHV(ip)		xfs_itobhv(ip)
+#else
+#define XFS_ITOBHV(ip)		((struct bhv_desc *)(&((ip)->i_bhv_desc)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOI)
+xfs_inode_t *xfs_bhvtoi(struct bhv_desc *bhvp);
+#define XFS_BHVTOI(bhvp)	xfs_bhvtoi(bhvp)
+#else
+#define XFS_BHVTOI(bhvp)	\
+	((xfs_inode_t *)((char *)(bhvp) - \
+			 (char *)&(((xfs_inode_t *)0)->i_bhv_desc)))
+#endif
+
+#define BHV_IS_XFS(bdp)		(BHV_OPS(bdp) == &xfs_vnodeops)
+
+/*
+ * Pick the inode cluster hash bucket
+ * (m_chash is the same size as m_ihash)
+ */
+#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize))
+
+/*
+ * For multiple groups support: if ISGID bit is set in the parent
+ * directory, group of new file is set to that of the parent, and
+ * new subdirectory gets ISGID bit from parent.
+ */
+#define XFS_INHERIT_GID(pip, vfsp)	((pip) != NULL && \
+	(((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & ISGID)))
+
+/*
+ * xfs_iget.c prototypes.
+ */
+void		xfs_ihash_init(struct xfs_mount *);
+void		xfs_ihash_free(struct xfs_mount *);
+void		xfs_chash_init(struct xfs_mount *);
+void		xfs_chash_free(struct xfs_mount *);
+xfs_inode_t	*xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
+				  struct xfs_trans *);
+void		xfs_inode_lock_init(xfs_inode_t *, struct vnode *);
+int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
+			 uint, xfs_inode_t **, xfs_daddr_t);
+int		xfs_vn_iget(vfs_t *, struct vnode *, xfs_ino_t);
+void		xfs_iput(xfs_inode_t *, uint);
+void		xfs_iput_new(xfs_inode_t *, uint);
+void		xfs_ilock(xfs_inode_t *, uint);
+int		xfs_ilock_nowait(xfs_inode_t *, uint);
+void		xfs_iunlock(xfs_inode_t *, uint);
+void		xfs_ilock_demote(xfs_inode_t *, uint);
+void		xfs_iflock(xfs_inode_t *);
+int		xfs_iflock_nowait(xfs_inode_t *);
+uint		xfs_ilock_map_shared(xfs_inode_t *);
+void		xfs_iunlock_map_shared(xfs_inode_t *, uint);
+void		xfs_ifunlock(xfs_inode_t *);
+void		xfs_ireclaim(xfs_inode_t *);
+int		xfs_finish_reclaim(xfs_inode_t *, int, int);
+
+/*
+ * xfs_inode.c prototypes.
+ */
+int		xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
+			    xfs_dinode_t **, struct xfs_buf **, int *);
+int		xfs_itobp(struct xfs_mount *, struct xfs_trans *,
+			  xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **,
+			  xfs_daddr_t);
+int		xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
+			  xfs_inode_t **, xfs_daddr_t);
+int		xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
+int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, nlink_t,
+			   dev_t, struct cred *, xfs_prid_t, int,
+			   struct xfs_buf **, boolean_t *, xfs_inode_t **);
+void		xfs_xlate_dinode_core(xfs_caddr_t, struct xfs_dinode_core *, int,
+			   xfs_arch_t);
+int		xfs_ifree(struct xfs_trans *, xfs_inode_t *);
+int		xfs_atruncate_start(xfs_inode_t *);
+void		xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
+int		xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
+				     xfs_fsize_t, int, int);
+int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
+int		xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *);
+void		xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *,
+				 xfs_fsize_t, int);
+
+void		xfs_idestroy_fork(xfs_inode_t *, int);
+void		xfs_idestroy(xfs_inode_t *);
+void		xfs_idata_realloc(xfs_inode_t *, int, int);
+void		xfs_iextract(xfs_inode_t *);
+void		xfs_iext_realloc(xfs_inode_t *, int, int);
+void		xfs_iroot_realloc(xfs_inode_t *, int, int);
+void		xfs_ipin(xfs_inode_t *);
+void		xfs_iunpin(xfs_inode_t *);
+int		xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_32_t *, int);
+int		xfs_iflush(xfs_inode_t *, uint);
+int		xfs_iflush_all(struct xfs_mount *, int);
+int		xfs_ibusy_check(xfs_inode_t *, int);
+int		xfs_iaccess(xfs_inode_t *, mode_t, cred_t *);
+uint		xfs_iroundup(uint);
+void		xfs_ichgtime(xfs_inode_t *, int);
+xfs_fsize_t	xfs_file_last_byte(xfs_inode_t *);
+void		xfs_lock_inodes(xfs_inode_t **, int, int, uint);
+
+#define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
+
+
+
+#ifdef DEBUG
+void		xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t);
+#else	/* DEBUG */
+#define xfs_isize_check(mp, ip, isize)
+#endif	/* DEBUG */
+
+#if defined(DEBUG)
+void		xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+#else
+#define xfs_inobp_check(mp, bp)
+#endif /* DEBUG */
+
+extern struct kmem_zone	*xfs_chashlist_zone;
+extern struct kmem_zone	*xfs_ifork_zone;
+extern struct kmem_zone	*xfs_inode_zone;
+extern struct kmem_zone	*xfs_ili_zone;
+extern struct vnodeops	xfs_vnodeops;
+
+#ifdef XFS_ILOCK_TRACE
+#define XFS_ILOCK_KTRACE_SIZE	32
+void	xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags,
+			inst_t *ra);
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
new file mode 100644
index 000000000000..9beacce25dec
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.c
@@ -0,0 +1,1024 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * This file contains the implementation of the xfs_inode_log_item.
+ * It contains the item operations used to manipulate the inode log
+ * items as well as utility routines used by the inode specific
+ * transaction routines.
+ */
+#include <xfs.h>
+
+
+kmem_zone_t	*xfs_ili_zone;		/* inode log item zone */
+
+/*
+ * This returns the number of iovecs needed to log the given inode item.
+ *
+ * We need one iovec for the inode log format structure, one for the
+ * inode core, and possibly one for the inode data/extents/b-tree root
+ * and one for the inode attribute data/extents/b-tree root.
+ */
+STATIC uint
+xfs_inode_item_size(
+	xfs_inode_log_item_t	*iip)
+{
+	uint		nvecs;
+	xfs_inode_t	*ip;
+
+	ip = iip->ili_inode;
+	nvecs = 2;
+
+	/*
+	 * Only log the data/extents/b-tree root if there is something
+	 * left to log.
+	 */
+	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
+		    (ip->i_d.di_nextents > 0) &&
+		    (ip->i_df.if_bytes > 0)) {
+			ASSERT(ip->i_df.if_u1.if_extents != NULL);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		ASSERT(ip->i_df.if_ext_max ==
+		       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
+		    (ip->i_df.if_broot_bytes > 0)) {
+			ASSERT(ip->i_df.if_broot != NULL);
+			nvecs++;
+		} else {
+			ASSERT(!(iip->ili_format.ilf_fields &
+				 XFS_ILOG_DBROOT));
+#ifdef XFS_TRANS_DEBUG
+			if (iip->ili_root_size > 0) {
+				ASSERT(iip->ili_root_size ==
+				       ip->i_df.if_broot_bytes);
+				ASSERT(bcmp(iip->ili_orig_root,
+					    ip->i_df.if_broot,
+					    iip->ili_root_size) == 0);
+			} else {
+				ASSERT(ip->i_df.if_broot_bytes == 0);
+			}
+#endif
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
+		}
+		break;
+
+	case XFS_DINODE_FMT_LOCAL:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
+		    (ip->i_df.if_bytes > 0)) {
+			ASSERT(ip->i_df.if_u1.if_data != NULL);
+			ASSERT(ip->i_d.di_size > 0);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEXT | XFS_ILOG_UUID);
+		break;
+
+	case XFS_DINODE_FMT_UUID:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEXT | XFS_ILOG_DEV);
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	/*
+	 * If there are no attributes associated with this file,
+	 * then there cannot be anything more to log.
+	 * Clear all attribute-related log flags.
+	 */
+	if (!XFS_IFORK_Q(ip)) {
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+		return nvecs;
+	}
+
+	/*
+	 * Log any necessary attribute data.
+	 */
+	switch (ip->i_d.di_aformat) {
+	case XFS_DINODE_FMT_EXTENTS:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
+		    (ip->i_d.di_anextents > 0) &&
+		    (ip->i_afp->if_bytes > 0)) {
+			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
+		    (ip->i_afp->if_broot_bytes > 0)) {
+			ASSERT(ip->i_afp->if_broot != NULL);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
+		}
+		break;
+
+	case XFS_DINODE_FMT_LOCAL:
+		iip->ili_format.ilf_fields &=
+			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
+		if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
+		    (ip->i_afp->if_bytes > 0)) {
+			ASSERT(ip->i_afp->if_u1.if_data != NULL);
+			nvecs++;
+		} else {
+			iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	return nvecs;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given inode log item.  It fills the first item with an inode
+ * log format structure, the second with the on-disk inode structure,
+ * and a possible third and/or fourth with the inode data/extents/b-tree
+ * root and inode attributes data/extents/b-tree root.
+ */
+STATIC void
+xfs_inode_item_format(
+	xfs_inode_log_item_t	*iip,
+	xfs_log_iovec_t		*log_vector)
+{
+	uint			nvecs;
+	xfs_log_iovec_t		*vecp;
+	xfs_inode_t		*ip;
+	size_t			data_bytes;
+	xfs_bmbt_rec_32_t	*ext_buffer;
+	int			nrecs;
+	xfs_mount_t		*mp;
+
+	ip = iip->ili_inode;
+	vecp = log_vector;
+
+	vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
+	vecp->i_len  = sizeof(xfs_inode_log_format_t);
+	vecp++;
+	nvecs	     = 1;
+
+	/*
+	 * Clear i_update_core if the timestamps (or any other
+	 * non-transactional modification) need flushing/logging
+	 * and we're about to log them with the rest of the core.
+	 *
+	 * This is the same logic as xfs_iflush() but this code can't
+	 * run at the same time as xfs_iflush because we're in commit
+	 * processing here and so we have the inode lock held in
+	 * exclusive mode.  Although it doesn't really matter
+	 * for the timestamps if both routines were to grab the
+	 * timestamps or not.  That would be ok.
+	 *
+	 * We clear i_update_core before copying out the data.
+	 * This is for coordination with our timestamp updates
+	 * that don't hold the inode lock. They will always
+	 * update the timestamps BEFORE setting i_update_core,
+	 * so if we clear i_update_core after they set it we
+	 * are guaranteed to see their updates to the timestamps
+	 * either here.	 Likewise, if they set it after we clear it
+	 * here, we'll see it either on the next commit of this
+	 * inode or the next time the inode gets flushed via
+	 * xfs_iflush().  This depends on strongly ordered memory
+	 * semantics, but we have that.	 We use the SYNCHRONIZE
+	 * macro to make sure that the compiler does not reorder
+	 * the i_update_core access below the data copy below.
+	 */
+	if (ip->i_update_core)	{
+		ip->i_update_core = 0;
+		SYNCHRONIZE();
+	}
+
+	/*
+	 * We don't have to worry about re-ordering here because
+	 * the update_size field is protected by the inode lock
+	 * and we have that held in exclusive mode.
+	 */
+	if (ip->i_update_size)
+		ip->i_update_size = 0;
+
+	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
+	vecp->i_len  = sizeof(xfs_dinode_core_t);
+	vecp++;
+	nvecs++;
+	iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
+
+	/*
+	 * If this is really an old format inode, then we need to
+	 * log it as such.  This means that we have to copy the link
+	 * count from the new field to the old.	 We don't have to worry
+	 * about the new fields, because nothing trusts them as long as
+	 * the old inode version number is there.  If the superblock already
+	 * has a new version number, then we don't bother converting back.
+	 */
+	mp = ip->i_mount;
+	ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
+	       XFS_SB_VERSION_HASNLINK(&mp->m_sb));
+	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+		if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+			/*
+			 * Convert it back.
+			 */
+			ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
+			ip->i_d.di_onlink = ip->i_d.di_nlink;
+		} else {
+			/*
+			 * The superblock version has already been bumped,
+			 * so just make the conversion to the new inode
+			 * format permanent.
+			 */
+			ip->i_d.di_version = XFS_DINODE_VERSION_2;
+			ip->i_d.di_onlink = 0;
+			bzero(&(ip->i_d.di_pad[0]), sizeof(ip->i_d.di_pad));
+		}
+	}
+
+	switch (ip->i_d.di_format) {
+	case XFS_DINODE_FMT_EXTENTS:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
+			ASSERT(ip->i_df.if_bytes > 0);
+			ASSERT(ip->i_df.if_u1.if_extents != NULL);
+			ASSERT(ip->i_d.di_nextents > 0);
+			ASSERT(iip->ili_extents_buf == NULL);
+			nrecs = ip->i_df.if_bytes /
+				(uint)sizeof(xfs_bmbt_rec_t);
+			ASSERT(nrecs > 0);
+			if (nrecs == ip->i_d.di_nextents) {
+				/*
+				 * There are no delayed allocation
+				 * extents, so just point to the
+				 * real extents array.
+				 */
+				vecp->i_addr =
+					(char *)(ip->i_df.if_u1.if_extents);
+				vecp->i_len = ip->i_df.if_bytes;
+			} else {
+				/*
+				 * There are delayed allocation extents
+				 * in the inode.  Use xfs_iextents_copy()
+				 * to copy only the real extents into
+				 * a separate buffer.  We'll free the
+				 * buffer in the unlock routine.
+				 */
+				ext_buffer = kmem_alloc(ip->i_df.if_bytes,
+					KM_SLEEP);
+				iip->ili_extents_buf = ext_buffer;
+				vecp->i_addr = (xfs_caddr_t)ext_buffer;
+				vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
+					XFS_DATA_FORK);
+			}
+			ASSERT(vecp->i_len <= ip->i_df.if_bytes);
+			iip->ili_format.ilf_dsize = vecp->i_len;
+			vecp++;
+			nvecs++;
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
+			ASSERT(ip->i_df.if_broot_bytes > 0);
+			ASSERT(ip->i_df.if_broot != NULL);
+			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
+			vecp->i_len = ip->i_df.if_broot_bytes;
+			vecp++;
+			nvecs++;
+			iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
+		}
+		break;
+
+	case XFS_DINODE_FMT_LOCAL:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
+			  XFS_ILOG_DEV | XFS_ILOG_UUID)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
+			ASSERT(ip->i_df.if_bytes > 0);
+			ASSERT(ip->i_df.if_u1.if_data != NULL);
+			ASSERT(ip->i_d.di_size > 0);
+
+			vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data;
+			/*
+			 * Round i_bytes up to a word boundary.
+			 * The underlying memory is guaranteed to
+			 * to be there by xfs_idata_realloc().
+			 */
+			data_bytes = roundup(ip->i_df.if_bytes, 4);
+			ASSERT((ip->i_df.if_real_bytes == 0) ||
+			       (ip->i_df.if_real_bytes == data_bytes));
+			vecp->i_len = (int)data_bytes;
+			vecp++;
+			nvecs++;
+			iip->ili_format.ilf_dsize = (unsigned)data_bytes;
+		}
+		break;
+
+	case XFS_DINODE_FMT_DEV:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
+			  XFS_ILOG_DDATA | XFS_ILOG_UUID)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+			iip->ili_format.ilf_u.ilfu_rdev =
+				ip->i_df.if_u2.if_rdev;
+		}
+		break;
+
+	case XFS_DINODE_FMT_UUID:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
+			  XFS_ILOG_DDATA | XFS_ILOG_DEV)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+			iip->ili_format.ilf_u.ilfu_uuid =
+				ip->i_df.if_u2.if_uuid;
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	/*
+	 * If there are no attributes associated with the file,
+	 * then we're done.
+	 * Assert that no attribute-related log flags are set.
+	 */
+	if (!XFS_IFORK_Q(ip)) {
+		ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+		iip->ili_format.ilf_size = nvecs;
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
+		return;
+	}
+
+	switch (ip->i_d.di_aformat) {
+	case XFS_DINODE_FMT_EXTENTS:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
+			ASSERT(ip->i_afp->if_bytes > 0);
+			ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+			ASSERT(ip->i_d.di_anextents > 0);
+#ifdef DEBUG
+			nrecs = ip->i_afp->if_bytes /
+				(uint)sizeof(xfs_bmbt_rec_t);
+#endif
+			ASSERT(nrecs > 0);
+			ASSERT(nrecs == ip->i_d.di_anextents);
+			/*
+			 * There are not delayed allocation extents
+			 * for attributes, so just point at the array.
+			 */
+			vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents);
+			vecp->i_len = ip->i_afp->if_bytes;
+			iip->ili_format.ilf_asize = vecp->i_len;
+			vecp++;
+			nvecs++;
+		}
+		break;
+
+	case XFS_DINODE_FMT_BTREE:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
+			ASSERT(ip->i_afp->if_broot_bytes > 0);
+			ASSERT(ip->i_afp->if_broot != NULL);
+			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
+			vecp->i_len = ip->i_afp->if_broot_bytes;
+			vecp++;
+			nvecs++;
+			iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
+		}
+		break;
+
+	case XFS_DINODE_FMT_LOCAL:
+		ASSERT(!(iip->ili_format.ilf_fields &
+			 (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
+		if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
+			ASSERT(ip->i_afp->if_bytes > 0);
+			ASSERT(ip->i_afp->if_u1.if_data != NULL);
+
+			vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data;
+			/*
+			 * Round i_bytes up to a word boundary.
+			 * The underlying memory is guaranteed to
+			 * to be there by xfs_idata_realloc().
+			 */
+			data_bytes = roundup(ip->i_afp->if_bytes, 4);
+			ASSERT((ip->i_afp->if_real_bytes == 0) ||
+			       (ip->i_afp->if_real_bytes == data_bytes));
+			vecp->i_len = (int)data_bytes;
+			vecp++;
+			nvecs++;
+			iip->ili_format.ilf_asize = (unsigned)data_bytes;
+		}
+		break;
+
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	ASSERT(nvecs == iip->ili_item.li_desc->lid_size);
+	iip->ili_format.ilf_size = nvecs;
+}
+
+
+/*
+ * This is called to pin the inode associated with the inode log
+ * item in memory so it cannot be written out.	Do this by calling
+ * xfs_ipin() to bump the pin count in the inode while holding the
+ * inode pin lock.
+ */
+STATIC void
+xfs_inode_item_pin(
+	xfs_inode_log_item_t	*iip)
+{
+	ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
+	xfs_ipin(iip->ili_inode);
+}
+
+
+/*
+ * This is called to unpin the inode associated with the inode log
+ * item which was previously pinned with a call to xfs_inode_item_pin().
+ * Just call xfs_iunpin() on the inode to do this.
+ */
+STATIC void
+xfs_inode_item_unpin(
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_iunpin(iip->ili_inode);
+}
+
+/* ARGSUSED */
+STATIC void
+xfs_inode_item_unpin_remove(
+	xfs_inode_log_item_t	*iip,
+	xfs_trans_t		*tp)
+{
+	xfs_iunpin(iip->ili_inode);
+}
+
+/*
+ * This is called to attempt to lock the inode associated with this
+ * inode log item, in preparation for the push routine which does the actual
+ * iflush.  Don't sleep on the inode lock or the flush lock.
+ *
+ * If the flush lock is already held, indicating that the inode has
+ * been or is in the process of being flushed, then (ideally) we'd like to
+ * see if the inode's buffer is still incore, and if so give it a nudge.
+ * We delay doing so until the pushbuf routine, though, to avoid holding
+ * the AIL lock across a call to the blackhole which is the buffercache.
+ * Also we don't want to sleep in any device strategy routines, which can happen
+ * if we do the subsequent bawrite in here.
+ */
+STATIC uint
+xfs_inode_item_trylock(
+	xfs_inode_log_item_t	*iip)
+{
+	register xfs_inode_t	*ip;
+
+	ip = iip->ili_inode;
+
+	if (xfs_ipincount(ip) > 0) {
+		return XFS_ITEM_PINNED;
+	}
+
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+		return XFS_ITEM_LOCKED;
+	}
+
+	if (!xfs_iflock_nowait(ip)) {
+		/*
+		 * If someone else isn't already trying to push the inode
+		 * buffer, we get to do it.
+		 */
+		if (iip->ili_pushbuf_flag == 0) {
+			iip->ili_pushbuf_flag = 1;
+#ifdef DEBUG
+			iip->ili_push_owner = get_thread_id();
+#endif
+			/*
+			 * Inode is left locked in shared mode.
+			 * Pushbuf routine gets to unlock it.
+			 */
+			return XFS_ITEM_PUSHBUF;
+		} else {
+			/*
+			 * We hold the AIL_LOCK, so we must specify the
+			 * NONOTIFY flag so that we won't double trip.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
+			return XFS_ITEM_FLUSHING;
+		}
+		/* NOTREACHED */
+	}
+#ifdef DEBUG
+	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		ASSERT(iip->ili_format.ilf_fields != 0);
+		ASSERT(iip->ili_logged == 0);
+		ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL);
+	}
+#endif
+	return XFS_ITEM_SUCCESS;
+}
+
+/*
+ * Unlock the inode associated with the inode log item.
+ * Clear the fields of the inode and inode log item that
+ * are specific to the current transaction.  If the
+ * hold flags is set, do not unlock the inode.
+ */
+STATIC void
+xfs_inode_item_unlock(
+	xfs_inode_log_item_t	*iip)
+{
+	uint		hold;
+	uint		iolocked;
+	uint		lock_flags;
+	xfs_inode_t	*ip;
+
+	ASSERT(iip != NULL);
+	ASSERT(iip->ili_inode->i_itemp != NULL);
+	ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE));
+	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
+		  XFS_ILI_IOLOCKED_EXCL)) ||
+	       ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE));
+	ASSERT((!(iip->ili_inode->i_itemp->ili_flags &
+		  XFS_ILI_IOLOCKED_SHARED)) ||
+	       ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS));
+	/*
+	 * Clear the transaction pointer in the inode.
+	 */
+	ip = iip->ili_inode;
+	ip->i_transp = NULL;
+
+	/*
+	 * If the inode needed a separate buffer with which to log
+	 * its extents, then free it now.
+	 */
+	/* FIXME */
+	if (iip->ili_extents_buf != NULL) {
+		ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
+		ASSERT(ip->i_d.di_nextents > 0);
+		ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
+		ASSERT(ip->i_df.if_bytes > 0);
+		kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes);
+		iip->ili_extents_buf = NULL;
+	}
+
+	/*
+	 * Figure out if we should unlock the inode or not.
+	 */
+	hold = iip->ili_flags & XFS_ILI_HOLD;
+
+	/*
+	 * Before clearing out the flags, remember whether we
+	 * are holding the inode's IO lock.
+	 */
+	iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY;
+
+	/*
+	 * Clear out the fields of the inode log item particular
+	 * to the current transaction.
+	 */
+	iip->ili_ilock_recur = 0;
+	iip->ili_iolock_recur = 0;
+	iip->ili_flags = 0;
+
+	/*
+	 * Unlock the inode if XFS_ILI_HOLD was not set.
+	 */
+	if (!hold) {
+		lock_flags = XFS_ILOCK_EXCL;
+		if (iolocked & XFS_ILI_IOLOCKED_EXCL) {
+			lock_flags |= XFS_IOLOCK_EXCL;
+		} else if (iolocked & XFS_ILI_IOLOCKED_SHARED) {
+			lock_flags |= XFS_IOLOCK_SHARED;
+		}
+		xfs_iput(iip->ili_inode, lock_flags);
+	}
+}
+
+/*
+ * This is called to find out where the oldest active copy of the
+ * inode log item in the on disk log resides now that the last log
+ * write of it completed at the given lsn.  Since we always re-log
+ * all dirty data in an inode, the latest copy in the on disk log
+ * is the only one that matters.  Therefore, simply return the
+ * given lsn.
+ */
+/*ARGSUSED*/
+STATIC xfs_lsn_t
+xfs_inode_item_committed(
+	xfs_inode_log_item_t	*iip,
+	xfs_lsn_t		lsn)
+{
+	return (lsn);
+}
+
+/*
+ * The transaction with the inode locked has aborted.  The inode
+ * must not be dirty within the transaction (unless we're forcibly
+ * shutting down).  We simply unlock just as if the transaction
+ * had been cancelled.
+ */
+STATIC void
+xfs_inode_item_abort(
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_inode_item_unlock(iip);
+	return;
+}
+
+
+/*
+ * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
+ * failed to get the inode flush lock but did get the inode locked SHARED.
+ * Here we're trying to see if the inode buffer is incore, and if so whether it's
+ * marked delayed write. If that's the case, we'll initiate a bawrite on that
+ * buffer to expedite the process.
+ *
+ * We aren't holding the AIL_LOCK (or the flush lock) when this gets called,
+ * so it is inherently race-y.
+ */
+STATIC void
+xfs_inode_item_pushbuf(
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+	xfs_buf_t	*bp;
+	uint		dopush;
+
+	ip = iip->ili_inode;
+
+	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
+
+	/*
+	 * The ili_pushbuf_flag keeps others from
+	 * trying to duplicate our effort.
+	 */
+	ASSERT(iip->ili_pushbuf_flag != 0);
+	ASSERT(iip->ili_push_owner == get_thread_id());
+
+	/*
+	 * If flushlock isn't locked anymore, chances are that the
+	 * inode flush completed and the inode was taken off the AIL.
+	 * So, just get out.
+	 */
+	if ((valusema(&(ip->i_flock)) > 0)  ||
+	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
+		iip->ili_pushbuf_flag = 0;
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return;
+	}
+
+	mp = ip->i_mount;
+	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
+		    iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK);
+
+	if (bp != NULL) {
+		if (XFS_BUF_ISDELAYWRITE(bp)) {
+			/*
+			 * We were racing with iflush because we don't hold
+			 * the AIL_LOCK or the flush lock. However, at this point,
+			 * we have the buffer, and we know that it's dirty.
+			 * So, it's possible that iflush raced with us, and
+			 * this item is already taken off the AIL.
+			 * If not, we can flush it async.
+			 */
+			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
+				  (valusema(&(ip->i_flock)) <= 0));
+			iip->ili_pushbuf_flag = 0;
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			xfs_buftrace("INODE ITEM PUSH", bp);
+			if (XFS_BUF_ISPINNED(bp)) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE);
+			}
+			if (dopush) {
+				xfs_bawrite(mp, bp);
+			} else {
+				xfs_buf_relse(bp);
+			}
+		} else {
+			iip->ili_pushbuf_flag = 0;
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			xfs_buf_relse(bp);
+		}
+		return;
+	}
+	/*
+	 * We have to be careful about resetting pushbuf flag too early (above).
+	 * Even though in theory we can do it as soon as we have the buflock,
+	 * we don't want others to be doing work needlessly. They'll come to
+	 * this function thinking that pushing the buffer is their
+	 * responsibility only to find that the buffer is still locked by
+	 * another doing the same thing
+	 */
+	iip->ili_pushbuf_flag = 0;
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return;
+}
+
+
+/*
+ * This is called to asynchronously write the inode associated with this
+ * inode log item out to disk. The inode will already have been locked by
+ * a successful call to xfs_inode_item_trylock().
+ */
+STATIC void
+xfs_inode_item_push(
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_inode_t	*ip;
+
+	ip = iip->ili_inode;
+
+	ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS));
+	ASSERT(valusema(&(ip->i_flock)) <= 0);
+	/*
+	 * Since we were able to lock the inode's flush lock and
+	 * we found it on the AIL, the inode must be dirty.  This
+	 * is because the inode is removed from the AIL while still
+	 * holding the flush lock in xfs_iflush_done().	 Thus, if
+	 * we found it in the AIL and were able to obtain the flush
+	 * lock without sleeping, then there must not have been
+	 * anyone in the process of flushing the inode.
+	 */
+	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
+	       iip->ili_format.ilf_fields != 0);
+
+	/*
+	 * Write out the inode.	 The completion routine ('iflush_done') will
+	 * pull it from the AIL, mark it clean, unlock the flush lock.
+	 */
+	(void) xfs_iflush(ip, XFS_IFLUSH_DELWRI);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	return;
+}
+
+/*
+ * XXX rcc - this one really has to do something.  Probably needs
+ * to stamp in a new field in the incore inode.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_inode_item_committing(
+	xfs_inode_log_item_t	*iip,
+	xfs_lsn_t		lsn)
+{
+	iip->ili_last_lsn = lsn;
+	return;
+}
+
+/*
+ * This is the ops vector shared by all buf log items.
+ */
+struct xfs_item_ops xfs_inode_item_ops = {
+	.iop_size	= (uint(*)(xfs_log_item_t*))xfs_inode_item_size,
+	.iop_format	= (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
+					xfs_inode_item_format,
+	.iop_pin	= (void(*)(xfs_log_item_t*))xfs_inode_item_pin,
+	.iop_unpin	= (void(*)(xfs_log_item_t*))xfs_inode_item_unpin,
+	.iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*))
+					xfs_inode_item_unpin_remove,
+	.iop_trylock	= (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock,
+	.iop_unlock	= (void(*)(xfs_log_item_t*))xfs_inode_item_unlock,
+	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_inode_item_committed,
+	.iop_push	= (void(*)(xfs_log_item_t*))xfs_inode_item_push,
+	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_inode_item_abort,
+	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
+	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
+					xfs_inode_item_committing
+};
+
+
+/*
+ * Initialize the inode log item for a newly allocated (in-core) inode.
+ */
+void
+xfs_inode_item_init(
+	xfs_inode_t	*ip,
+	xfs_mount_t	*mp)
+{
+	xfs_inode_log_item_t	*iip;
+
+	ASSERT(ip->i_itemp == NULL);
+	iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP);
+
+	iip->ili_item.li_type = XFS_LI_INODE;
+	iip->ili_item.li_ops = &xfs_inode_item_ops;
+	iip->ili_item.li_mountp = mp;
+	iip->ili_inode = ip;
+
+	/*
+	   We have bzeroed memory. No need ...
+	   iip->ili_extents_buf = NULL;
+	   iip->ili_pushbuf_flag = 0;
+	 */
+
+	iip->ili_format.ilf_type = XFS_LI_INODE;
+	iip->ili_format.ilf_ino = ip->i_ino;
+	iip->ili_format.ilf_blkno = ip->i_blkno;
+	iip->ili_format.ilf_len = ip->i_len;
+	iip->ili_format.ilf_boffset = ip->i_boffset;
+}
+
+/*
+ * Free the inode log item and any memory hanging off of it.
+ */
+void
+xfs_inode_item_destroy(
+	xfs_inode_t	*ip)
+{
+#ifdef XFS_TRANS_DEBUG
+	if (ip->i_itemp->ili_root_size != 0) {
+		kmem_free(ip->i_itemp->ili_orig_root,
+			  ip->i_itemp->ili_root_size);
+	}
+#endif
+	kmem_zone_free(xfs_ili_zone, ip->i_itemp);
+}
+
+
+/*
+ * This is the inode flushing I/O completion routine.  It is called
+ * from interrupt level when the buffer containing the inode is
+ * flushed to disk.  It is responsible for removing the inode item
+ * from the AIL if it has not been re-logged, and unlocking the inode's
+ * flush lock.
+ */
+/*ARGSUSED*/
+void
+xfs_iflush_done(
+	xfs_buf_t		*bp,
+	xfs_inode_log_item_t	*iip)
+{
+	xfs_inode_t	*ip;
+	SPLDECL(s);
+
+	ip = iip->ili_inode;
+
+	/*
+	 * We only want to pull the item from the AIL if it is
+	 * actually there and its location in the log has not
+	 * changed since we started the flush.	Thus, we only bother
+	 * if the ili_logged flag is set and the inode's lsn has not
+	 * changed.  First we check the lsn outside
+	 * the lock since it's cheaper, and then we recheck while
+	 * holding the lock before removing the inode from the AIL.
+	 */
+	if (iip->ili_logged &&
+	    (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
+		AIL_LOCK(ip->i_mount, s);
+		if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
+			/*
+			 * xfs_trans_delete_ail() drops the AIL lock.
+			 */
+			xfs_trans_delete_ail(ip->i_mount,
+					     (xfs_log_item_t*)iip, s);
+		} else {
+			AIL_UNLOCK(ip->i_mount, s);
+		}
+	}
+
+	iip->ili_logged = 0;
+
+	/*
+	 * Clear the ili_last_fields bits now that we know that the
+	 * data corresponding to them is safely on disk.
+	 */
+	iip->ili_last_fields = 0;
+
+	/*
+	 * Release the inode's flush lock since we're done with it.
+	 */
+	xfs_ifunlock(ip);
+
+	return;
+}
+
+/*
+ * This is the inode flushing abort routine.  It is called
+ * from xfs_iflush when the filesystem is shutting down to clean
+ * up the inode state.
+ * It is responsible for removing the inode item
+ * from the AIL if it has not been re-logged, and unlocking the inode's
+ * flush lock.
+ */
+void
+xfs_iflush_abort(
+	xfs_inode_t		*ip)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_mount_t		*mp;
+	SPLDECL(s);
+
+	iip = ip->i_itemp;
+	mp = ip->i_mount;
+	if (iip) {
+		if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+			AIL_LOCK(mp, s);
+			if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+				/*
+				 * xfs_trans_delete_ail() drops the AIL lock.
+				 */
+				xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip,
+					s);
+			} else
+				AIL_UNLOCK(mp, s);
+		}
+		iip->ili_logged = 0;
+		/*
+		 * Clear the ili_last_fields bits now that we know that the
+		 * data corresponding to them is safely on disk.
+		 */
+		iip->ili_last_fields = 0;
+		/*
+		 * Clear the inode logging fields so no more flushes are
+		 * attempted.
+		 */
+		iip->ili_format.ilf_fields = 0;
+	}
+	/*
+	 * Release the inode's flush lock since we're done with it.
+	 */
+	xfs_ifunlock(ip);
+}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
new file mode 100644
index 000000000000..d90407088842
--- /dev/null
+++ b/fs/xfs/xfs_inode_item.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_INODE_ITEM_H__
+#define __XFS_INODE_ITEM_H__
+
+/*
+ * This is the structure used to lay out an inode log item in the
+ * log.	 The size of the inline data/extents/b-tree root to be logged
+ * (if any) is indicated in the ilf_dsize field.  Changes to this structure
+ * must be added on to the end.
+ *
+ * Convention for naming inode log item versions :  The current version
+ * is always named XFS_LI_INODE.  When an inode log item gets superseded,
+ * add the latest version of IRIX that will generate logs with that item
+ * to the version name.
+ *
+ * -Version 1 of this structure (XFS_LI_5_3_INODE) included up to the first
+ *	union (ilf_u) field.  This was released with IRIX 5.3-XFS.
+ * -Version 2 of this structure (XFS_LI_6_1_INODE) is currently the entire
+ *	structure.  This was released with IRIX 6.0.1-XFS and IRIX 6.1.
+ * -Version 3 of this structure (XFS_LI_INODE) is the same as version 2
+ *	so a new structure definition wasn't necessary.	 However, we had
+ *	to add a new type because the inode cluster size changed from 4K
+ *	to 8K and the version number had to be rev'ved to keep older kernels
+ *	from trying to recover logs with the 8K buffers in them.  The logging
+ *	code can handle recovery on different-sized clusters now so hopefully
+ *	this'll be the last time we need to change the inode log item just
+ *	for a change in the inode cluster size.	 This new version was
+ *	released with IRIX 6.2.
+ */
+typedef struct xfs_inode_log_format {
+	unsigned short		ilf_type;	/* inode log item type */
+	unsigned short		ilf_size;	/* size of this item */
+	uint			ilf_fields;	/* flags for fields logged */
+	ushort			ilf_asize;	/* size of attr d/ext/root */
+	ushort			ilf_dsize;	/* size of data/ext/root */
+	xfs_ino_t		ilf_ino;	/* inode number */
+	union {
+		xfs_dev_t	ilfu_rdev;	/* rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* mount point value */
+	} ilf_u;
+	__int64_t		ilf_blkno;	/* blkno of inode buffer */
+	int			ilf_len;	/* len of inode buffer */
+	int			ilf_boffset;	/* off of inode in buffer */
+} xfs_inode_log_format_t;
+
+/* Initial version shipped with IRIX 5.3-XFS */
+typedef struct xfs_inode_log_format_v1 {
+	unsigned short		ilf_type;	/* inode log item type */
+	unsigned short		ilf_size;	/* size of this item */
+	uint			ilf_fields;	/* flags for fields logged */
+	uint			ilf_dsize;	/* size of data/ext/root */
+	xfs_ino_t		ilf_ino;	/* inode number */
+	union {
+		xfs_dev_t	ilfu_rdev;	/* rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* mount point value */
+	} ilf_u;
+} xfs_inode_log_format_t_v1;
+
+/*
+ * Flags for xfs_trans_log_inode flags field.
+ */
+#define XFS_ILOG_CORE	0x001	/* log standard inode fields */
+#define XFS_ILOG_DDATA	0x002	/* log i_df.if_data */
+#define XFS_ILOG_DEXT	0x004	/* log i_df.if_extents */
+#define XFS_ILOG_DBROOT 0x008	/* log i_df.i_broot */
+#define XFS_ILOG_DEV	0x010	/* log the dev field */
+#define XFS_ILOG_UUID	0x020	/* log the uuid field */
+#define XFS_ILOG_ADATA	0x040	/* log i_af.if_data */
+#define XFS_ILOG_AEXT	0x080	/* log i_af.if_extents */
+#define XFS_ILOG_ABROOT 0x100	/* log i_af.i_broot */
+
+#define XFS_ILOG_NONCORE	(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+				 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+				 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
+				 XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
+
+#define XFS_ILOG_DFORK		(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+				 XFS_ILOG_DBROOT)
+
+#define XFS_ILOG_AFORK		(XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+				 XFS_ILOG_ABROOT)
+
+#define XFS_ILOG_ALL		(XFS_ILOG_CORE | XFS_ILOG_DDATA | \
+				 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
+				 XFS_ILOG_DEV | XFS_ILOG_UUID | \
+				 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+				 XFS_ILOG_ABROOT)
+
+#define XFS_ILI_HOLD		0x1
+#define XFS_ILI_IOLOCKED_EXCL	0x2
+#define XFS_ILI_IOLOCKED_SHARED 0x4
+
+#define XFS_ILI_IOLOCKED_ANY   (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
+
+
+#ifdef __KERNEL__
+
+struct xfs_buf;
+struct xfs_bmbt_rec_32;
+struct xfs_inode;
+struct xfs_mount;
+
+
+typedef struct xfs_inode_log_item {
+	xfs_log_item_t		ili_item;	   /* common portion */
+	struct xfs_inode	*ili_inode;	   /* inode ptr */
+	xfs_lsn_t		ili_flush_lsn;	   /* lsn at last flush */
+	xfs_lsn_t		ili_last_lsn;	   /* lsn at last transaction */
+	unsigned short		ili_ilock_recur;   /* lock recursion count */
+	unsigned short		ili_iolock_recur;  /* lock recursion count */
+	unsigned short		ili_flags;	   /* misc flags */
+	unsigned short		ili_logged;	   /* flushed logged data */
+	unsigned int		ili_last_fields;   /* fields when flushed */
+	struct xfs_bmbt_rec_32	*ili_extents_buf;  /* array of logged exts */
+	unsigned int		ili_pushbuf_flag;  /* one bit used in push_ail */
+
+#ifdef DEBUG
+	uint64_t		ili_push_owner;	   /* one who sets pushbuf_flag
+						      above gets to push the buf */
+#endif
+#ifdef XFS_TRANS_DEBUG
+	int			ili_root_size;
+	char			*ili_orig_root;
+#endif
+	xfs_inode_log_format_t	ili_format;	   /* logged structure */
+} xfs_inode_log_item_t;
+
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FDATA)
+int xfs_ilog_fdata(int w);
+#define XFS_ILOG_FDATA(w)	xfs_ilog_fdata(w)
+#else
+#define XFS_ILOG_FDATA(w)	\
+	((w) == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FBROOT)
+int xfs_ilog_fbroot(int w);
+#define XFS_ILOG_FBROOT(w)	xfs_ilog_fbroot(w)
+#else
+#define XFS_ILOG_FBROOT(w)	\
+	((w) == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_ILOG_FEXT)
+int xfs_ilog_fext(int w);
+#define XFS_ILOG_FEXT(w)	xfs_ilog_fext(w)
+#else
+#define XFS_ILOG_FEXT(w)	\
+	((w) == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT)
+#endif
+
+#ifdef __KERNEL__
+
+void	xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
+void	xfs_inode_item_destroy(struct xfs_inode *);
+void	xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
+void	xfs_iflush_abort(struct xfs_inode *);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
new file mode 100644
index 000000000000..7fa4aca62d09
--- /dev/null
+++ b/fs/xfs/xfs_inum.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_INUM_H__
+#define __XFS_INUM_H__
+
+/*
+ * Inode number format:
+ * low inopblog bits - offset in block
+ * next agblklog bits - block number in ag
+ * next agno_log bits - ag number
+ * high agno_log-agblklog-inopblog bits - 0
+ */
+
+typedef __uint32_t	xfs_agino_t;	/* within allocation grp inode number */
+
+/*
+ * Useful inode bits for this kernel.
+ * Used in some places where having 64-bits in the 32-bit kernels
+ * costs too much.
+ */
+#if XFS_BIG_FILESYSTEMS
+typedef xfs_ino_t	xfs_intino_t;
+#else
+typedef __uint32_t	xfs_intino_t;
+#endif
+
+#define NULLFSINO	((xfs_ino_t)-1)
+#define NULLAGINO	((xfs_agino_t)-1)
+
+struct xfs_mount;
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_MASK)
+__uint32_t xfs_ino_mask(int k);
+#define XFS_INO_MASK(k)			xfs_ino_mask(k)
+#else
+#define XFS_INO_MASK(k) ((__uint32_t)((1ULL << (k)) - 1))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_OFFSET_BITS)
+int xfs_ino_offset_bits(struct xfs_mount *mp);
+#define XFS_INO_OFFSET_BITS(mp)		xfs_ino_offset_bits(mp)
+#else
+#define XFS_INO_OFFSET_BITS(mp) ((mp)->m_sb.sb_inopblog)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGBNO_BITS)
+int xfs_ino_agbno_bits(struct xfs_mount *mp);
+#define XFS_INO_AGBNO_BITS(mp)		xfs_ino_agbno_bits(mp)
+#else
+#define XFS_INO_AGBNO_BITS(mp)	((mp)->m_sb.sb_agblklog)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGINO_BITS)
+int xfs_ino_agino_bits(struct xfs_mount *mp);
+#define XFS_INO_AGINO_BITS(mp)		xfs_ino_agino_bits(mp)
+#else
+#define XFS_INO_AGINO_BITS(mp)		((mp)->m_agino_log)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_AGNO_BITS)
+int xfs_ino_agno_bits(struct xfs_mount *mp);
+#define XFS_INO_AGNO_BITS(mp)		xfs_ino_agno_bits(mp)
+#else
+#define XFS_INO_AGNO_BITS(mp)	((mp)->m_agno_log)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_BITS)
+int xfs_ino_bits(struct xfs_mount *mp);
+#define XFS_INO_BITS(mp)		xfs_ino_bits(mp)
+#else
+#define XFS_INO_BITS(mp)	(XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGNO)
+xfs_agnumber_t xfs_ino_to_agno(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_AGNO(mp,i)		xfs_ino_to_agno(mp,i)
+#else
+#define XFS_INO_TO_AGNO(mp,i)	\
+	((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGINO)
+xfs_agino_t xfs_ino_to_agino(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_AGINO(mp,i)		xfs_ino_to_agino(mp,i)
+#else
+#define XFS_INO_TO_AGINO(mp,i)	\
+	((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_AGBNO)
+xfs_agblock_t xfs_ino_to_agbno(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_AGBNO(mp,i)		xfs_ino_to_agbno(mp,i)
+#else
+#define XFS_INO_TO_AGBNO(mp,i)	\
+	(((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \
+	 XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_OFFSET)
+int xfs_ino_to_offset(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_OFFSET(mp,i)		xfs_ino_to_offset(mp,i)
+#else
+#define XFS_INO_TO_OFFSET(mp,i) \
+	((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INO_TO_FSB)
+xfs_fsblock_t xfs_ino_to_fsb(struct xfs_mount *mp, xfs_ino_t i);
+#define XFS_INO_TO_FSB(mp,i)		xfs_ino_to_fsb(mp,i)
+#else
+#define XFS_INO_TO_FSB(mp,i)	\
+	XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_INO)
+xfs_ino_t
+xfs_agino_to_ino(struct xfs_mount *mp, xfs_agnumber_t a, xfs_agino_t i);
+#define XFS_AGINO_TO_INO(mp,a,i)	xfs_agino_to_ino(mp,a,i)
+#else
+#define XFS_AGINO_TO_INO(mp,a,i)	\
+	(((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_AGBNO)
+xfs_agblock_t xfs_agino_to_agbno(struct xfs_mount *mp, xfs_agino_t i);
+#define XFS_AGINO_TO_AGBNO(mp,i)	xfs_agino_to_agbno(mp,i)
+#else
+#define XFS_AGINO_TO_AGBNO(mp,i)	((i) >> XFS_INO_OFFSET_BITS(mp))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_AGINO_TO_OFFSET)
+int xfs_agino_to_offset(struct xfs_mount *mp, xfs_agino_t i);
+#define XFS_AGINO_TO_OFFSET(mp,i)	xfs_agino_to_offset(mp,i)
+#else
+#define XFS_AGINO_TO_OFFSET(mp,i)	\
+	((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_OFFBNO_TO_AGINO)
+xfs_agino_t xfs_offbno_to_agino(struct xfs_mount *mp, xfs_agblock_t b, int o);
+#define XFS_OFFBNO_TO_AGINO(mp,b,o)	xfs_offbno_to_agino(mp,b,o)
+#else
+#define XFS_OFFBNO_TO_AGINO(mp,b,o)	\
+	((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
+#endif
+
+#if XFS_BIG_FILESYSTEMS
+#define XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
+#define XFS_INO64_OFFSET	((xfs_ino_t)(1ULL << 32))
+#else
+#define XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 32) - 1ULL))
+#endif
+#define XFS_MAXINUMBER_32	((xfs_ino_t)((1ULL << 32) - 1ULL))
+
+#endif	/* __XFS_INUM_H__ */
diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c
new file mode 100644
index 000000000000..0cdfc32037d5
--- /dev/null
+++ b/fs/xfs/xfs_iocore.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+static xfs_fsize_t
+xfs_size_fn(
+	xfs_inode_t	*ip)
+{
+	return (ip->i_d.di_size);
+}
+
+xfs_ioops_t	xfs_iocore_xfs = {
+	.xfs_bmapi_func		= (xfs_bmapi_t) xfs_bmapi,
+	.xfs_bmap_eof_func	= (xfs_bmap_eof_t) xfs_bmap_eof,
+	.xfs_ilock		= (xfs_lock_t) xfs_ilock,
+	.xfs_ilock_demote	= (xfs_lock_demote_t) xfs_ilock_demote,
+	.xfs_ilock_nowait	= (xfs_lock_nowait_t) xfs_ilock_nowait,
+	.xfs_unlock		= (xfs_unlk_t) xfs_iunlock,
+	.xfs_size_func		= (xfs_size_t) xfs_size_fn,
+	.xfs_lastbyte		= (xfs_lastbyte_t) xfs_file_last_byte,
+};
+
+void
+xfs_iocore_inode_reinit(
+	xfs_inode_t	*ip)
+{
+	xfs_iocore_t	*io = &ip->i_iocore;
+
+	io->io_flags = XFS_IOCORE_ISXFS;
+	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+		io->io_flags |= XFS_IOCORE_RT;
+	}
+
+	io->io_dmevmask = ip->i_d.di_dmevmask;
+	io->io_dmstate = ip->i_d.di_dmstate;
+}
+
+void
+xfs_iocore_inode_init(
+	xfs_inode_t	*ip)
+{
+	xfs_iocore_t	*io = &ip->i_iocore;
+	xfs_mount_t	*mp = ip->i_mount;
+
+	io->io_mount = mp;
+#ifdef DEBUG
+	io->io_lock = &ip->i_lock;
+	io->io_iolock = &ip->i_iolock;
+#endif
+
+	io->io_obj = (void *)ip;
+
+	xfs_iocore_inode_reinit(ip);
+}
+
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
new file mode 100644
index 000000000000..d46f2ccf3844
--- /dev/null
+++ b/fs/xfs/xfs_itable.c
@@ -0,0 +1,770 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * Return stat information for one inode.
+ * Return 0 if ok, else errno.
+ */
+int					/* error status */
+xfs_bulkstat_one(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		*buffer,	/* buffer to place output in */
+	xfs_daddr_t	bno,		/* starting bno of inode cluster */
+	void		*dibuff,	/* on-disk inode buffer */
+	int		*stat)		/* BULKSTAT_RV_... */
+{
+	xfs_bstat_t	*buf;		/* return buffer */
+	int		error;		/* error value */
+	xfs_dinode_t	*dip;		/* dinode inode pointer */
+	xfs_dinode_core_t *dic;		/* dinode core info pointer */
+	xfs_inode_t	*ip = NULL;	/* incore inode pointer */
+	xfs_arch_t	arch;		/* these are set according to	   */
+	__uint16_t	di_flags;	/* temp */
+
+	buf = (xfs_bstat_t *)buffer;
+	dip = (xfs_dinode_t *)dibuff;
+
+	if (! buf || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
+	    (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
+	     (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) {
+		*stat = BULKSTAT_RV_NOTHING;
+		return XFS_ERROR(EINVAL);
+	}
+
+	if (dip == NULL) {
+		/* We're not being passed a pointer to a dinode.  This happens
+		 * if BULKSTAT_FG_IGET is selected.  Do the iget.
+		 */
+		error = xfs_iget(mp, tp, ino, XFS_ILOCK_SHARED, &ip, bno);
+		if (error) {
+			*stat = BULKSTAT_RV_NOTHING;
+			return error;
+		}
+		ASSERT(ip != NULL);
+		ASSERT(ip->i_blkno != (xfs_daddr_t)0);
+		if (ip->i_d.di_mode == 0) {
+			xfs_iput_new(ip, XFS_ILOCK_SHARED);
+			*stat = BULKSTAT_RV_NOTHING;
+			return XFS_ERROR(ENOENT);
+		}
+		dic = &ip->i_d;
+		arch = ARCH_NOCONVERT;		/* in-core! */
+		ASSERT(dic != NULL);
+
+		/* xfs_iget returns the following without needing
+		 * further change.
+		 */
+		buf->bs_nlink = dic->di_nlink;
+		buf->bs_projid = dic->di_projid;
+
+	} else {
+		dic = &dip->di_core;
+		ASSERT(dic != NULL);
+
+		/* buffer dinode_core is in on-disk arch */
+		arch = ARCH_CONVERT;
+
+		/*
+		 * The inode format changed when we moved the link count and
+		 * made it 32 bits long.  If this is an old format inode,
+		 * convert it in memory to look like a new one.	 If it gets
+		 * flushed to disk we will convert back before flushing or
+		 * logging it.	We zero out the new projid field and the old link
+		 * count field.	 We'll handle clearing the pad field (the remains
+		 * of the old uuid field) when we actually convert the inode to
+		 * the new format. We don't change the version number so that we
+		 * can distinguish this from a real new format inode.
+		 */
+		if (INT_GET(dic->di_version, arch) == XFS_DINODE_VERSION_1) {
+			buf->bs_nlink = INT_GET(dic->di_onlink, arch);
+			buf->bs_projid = 0;
+		}
+		else {
+			buf->bs_nlink = INT_GET(dic->di_nlink, arch);
+			buf->bs_projid = INT_GET(dic->di_projid, arch);
+		}
+
+	}
+
+	buf->bs_ino = ino;
+	buf->bs_mode = INT_GET(dic->di_mode, arch);
+	buf->bs_uid = INT_GET(dic->di_uid, arch);
+	buf->bs_gid = INT_GET(dic->di_gid, arch);
+	buf->bs_size = INT_GET(dic->di_size, arch);
+	buf->bs_atime.tv_sec = INT_GET(dic->di_atime.t_sec, arch);
+	buf->bs_atime.tv_nsec = INT_GET(dic->di_atime.t_nsec, arch);
+	buf->bs_mtime.tv_sec = INT_GET(dic->di_mtime.t_sec, arch);
+	buf->bs_mtime.tv_nsec = INT_GET(dic->di_mtime.t_nsec, arch);
+	buf->bs_ctime.tv_sec = INT_GET(dic->di_ctime.t_sec, arch);
+	buf->bs_ctime.tv_nsec = INT_GET(dic->di_ctime.t_nsec, arch);
+	/*
+	 * convert di_flags to bs_xflags.
+	 */
+	di_flags=INT_GET(dic->di_flags, arch);
+
+	buf->bs_xflags =
+		((di_flags & XFS_DIFLAG_REALTIME) ?
+			XFS_XFLAG_REALTIME : 0) |
+		((di_flags & XFS_DIFLAG_PREALLOC) ?
+			XFS_XFLAG_PREALLOC : 0) |
+		(XFS_CFORK_Q_ARCH(dic, arch) ?
+			XFS_XFLAG_HASATTR : 0);
+
+	buf->bs_extsize = INT_GET(dic->di_extsize, arch) << mp->m_sb.sb_blocklog;
+	buf->bs_extents = INT_GET(dic->di_nextents, arch);
+	buf->bs_gen = INT_GET(dic->di_gen, arch);
+	bzero(buf->bs_pad, sizeof(buf->bs_pad));
+	buf->bs_dmevmask = INT_GET(dic->di_dmevmask, arch);
+	buf->bs_dmstate = INT_GET(dic->di_dmstate, arch);
+	buf->bs_aextents = INT_GET(dic->di_anextents, arch);
+
+	switch (INT_GET(dic->di_format, arch)) {
+	case XFS_DINODE_FMT_DEV:
+		if ( ip ) {
+			buf->bs_rdev = ip->i_df.if_u2.if_rdev;
+		} else {
+			buf->bs_rdev = INT_GET(dip->di_u.di_dev, arch);
+		}
+
+		buf->bs_blksize = BLKDEV_IOSIZE;
+		buf->bs_blocks = 0;
+		break;
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_UUID:
+		buf->bs_rdev = 0;
+		buf->bs_blksize = mp->m_sb.sb_blocksize;
+		buf->bs_blocks = 0;
+		break;
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		buf->bs_rdev = 0;
+		buf->bs_blksize = mp->m_sb.sb_blocksize;
+		if ( ip ) {
+			buf->bs_blocks = INT_GET(dic->di_nblocks, arch) + ip->i_delayed_blks;
+		} else {
+			buf->bs_blocks = INT_GET(dic->di_nblocks, arch);
+		}
+		break;
+	}
+
+	if (ip) {
+		xfs_iput(ip, XFS_ILOCK_SHARED);
+	}
+
+	*stat = BULKSTAT_RV_DIDONE;
+	return 0;
+}
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ */
+int					/* error status */
+xfs_bulkstat(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_ino_t		*lastinop, /* last inode returned */
+	int			*ubcountp, /* size of buffer/count returned */
+	bulkstat_one_pf		formatter, /* func that'd fill a single buf */
+	size_t			statstruct_size, /* sizeof struct filling */
+	xfs_caddr_t		ubuffer, /* buffer with inode stats */
+	int			flags,	/* defined in xfs_itable.h */
+	int			*done)	/* 1 if there're more stats to get */
+{
+	xfs_agblock_t		agbno=0;/* allocation group block number */
+	xfs_buf_t		*agbp;	/* agi header buffer */
+	xfs_agi_t		*agi;	/* agi header data */
+	xfs_agino_t		agino;	/* inode # in allocation group */
+	xfs_agnumber_t		agno;	/* allocation group number */
+	xfs_daddr_t		bno;	/* inode cluster start daddr */
+	int			chunkidx; /* current index into inode chunk */
+	int			clustidx; /* current index into inode cluster */
+	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
+	int			end_of_ag; /* set if we've seen the ag end */
+	int			error;	/* error code */
+	int			fmterror;/* bulkstat formatter result */
+	__int32_t		gcnt;	/* current btree rec's count */
+	xfs_inofree_t		gfree;	/* current btree rec's free mask */
+	xfs_agino_t		gino;	/* current btree rec's start inode */
+	int			i;	/* loop index */
+	int			icount; /* count of inodes good in irbuf */
+	xfs_ino_t		ino;	/* inode number (filesystem) */
+	xfs_inobt_rec_t		*irbp;	/* current irec buffer pointer */
+	xfs_inobt_rec_t		*irbuf; /* start of irec buffer */
+	xfs_inobt_rec_t		*irbufend; /* end of good irec buffer entries */
+	xfs_ino_t		lastino=0; /* last inode number returned */
+	int			nbcluster; /* # of blocks in a cluster */
+	int			nicluster; /* # of inodes in a cluster */
+	int			nimask; /* mask for inode clusters */
+	int			nirbuf; /* size of irbuf */
+	int			rval;	/* return value error code */
+	int			tmp;	/* result value from btree calls */
+	int			ubcount; /* size of user's buffer */
+	int			ubleft; /* spaces left in user's buffer */
+	xfs_caddr_t		ubufp;	/* current pointer into user's buffer */
+	xfs_buf_t		*bp;	/* ptr to on-disk inode cluster buf */
+	xfs_dinode_t		*dip;	/* ptr into bp for specific inode */
+	xfs_inode_t		*ip;	/* ptr to in-core inode struct */
+
+	/*
+	 * Get the last inode value, see if there's nothing to do.
+	 */
+	ino = (xfs_ino_t)*lastinop;
+	dip = NULL;
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agino = XFS_INO_TO_AGINO(mp, ino);
+	if (agno >= mp->m_sb.sb_agcount ||
+	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+		*done = 1;
+		*ubcountp = 0;
+		return 0;
+	}
+	ubcount = ubleft = *ubcountp;
+	*ubcountp = 0;
+	*done = 0;
+	fmterror = 0;
+	ubufp = ubuffer;
+	nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
+		mp->m_sb.sb_inopblock :
+		(XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
+	nimask = ~(nicluster - 1);
+	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
+	/*
+	 * Lock down the user's buffer. If a buffer was not sent, as in the case
+	 * disk quota code calls here, we skip this.
+	 */
+#if defined(HAVE_USERACC)
+	if (ubuffer &&
+	    (error = useracc(ubuffer, ubcount * statstruct_size,
+			(B_READ|B_PHYS), NULL))) {
+		return error;
+	}
+#endif
+	/*
+	 * Allocate a page-sized buffer for inode btree records.
+	 * We could try allocating something smaller, but for normal
+	 * calls we'll always (potentially) need the whole page.
+	 */
+	irbuf = kmem_alloc(NBPC, KM_SLEEP);
+	nirbuf = NBPC / sizeof(*irbuf);
+	/*
+	 * Loop over the allocation groups, starting from the last
+	 * inode returned; 0 means start of the allocation group.
+	 */
+	rval = 0;
+	while (ubleft > 0 && agno < mp->m_sb.sb_agcount) {
+		bp = NULL;
+		down_read(&mp->m_peraglock);
+		error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+		up_read(&mp->m_peraglock);
+		if (error) {
+			/*
+			 * Skip this allocation group and go to the next one.
+			 */
+			agno++;
+			agino = 0;
+			continue;
+		}
+		agi = XFS_BUF_TO_AGI(agbp);
+		/*
+		 * Allocate and initialize a btree cursor for ialloc btree.
+		 */
+		cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
+			(xfs_inode_t *)0, 0);
+		irbp = irbuf;
+		irbufend = irbuf + nirbuf;
+		end_of_ag = 0;
+		/*
+		 * If we're returning in the middle of an allocation group,
+		 * we need to get the remainder of the chunk we're in.
+		 */
+		if (agino > 0) {
+			/*
+			 * Lookup the inode chunk that this inode lives in.
+			 */
+			error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp);
+			if (!error &&	/* no I/O error */
+			    tmp &&	/* lookup succeeded */
+					/* got the record, should always work */
+			    !(error = xfs_inobt_get_rec(cur, &gino, &gcnt,
+				    &gfree, &i, ARCH_NOCONVERT)) &&
+			    i == 1 &&
+					/* this is the right chunk */
+			    agino < gino + XFS_INODES_PER_CHUNK &&
+					/* lastino was not last in chunk */
+			    (chunkidx = agino - gino + 1) <
+				    XFS_INODES_PER_CHUNK &&
+					/* there are some left allocated */
+			    XFS_INOBT_MASKN(chunkidx,
+				    XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
+				/*
+				 * Grab the chunk record.  Mark all the
+				 * uninteresting inodes (because they're
+				 * before our start point) free.
+				 */
+				for (i = 0; i < chunkidx; i++) {
+					if (XFS_INOBT_MASK(i) & ~gfree)
+						gcnt++;
+				}
+				gfree |= XFS_INOBT_MASKN(0, chunkidx);
+				INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
+				INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
+				INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
+				irbp++;
+				agino = gino + XFS_INODES_PER_CHUNK;
+				icount = XFS_INODES_PER_CHUNK - gcnt;
+			} else {
+				/*
+				 * If any of those tests failed, bump the
+				 * inode number (just in case).
+				 */
+				agino++;
+				icount = 0;
+			}
+			/*
+			 * In any case, increment to the next record.
+			 */
+			if (!error)
+				error = xfs_inobt_increment(cur, 0, &tmp);
+		} else {
+			/*
+			 * Start of ag.	 Lookup the first inode chunk.
+			 */
+			error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp);
+			icount = 0;
+		}
+		/*
+		 * Loop through inode btree records in this ag,
+		 * until we run out of inodes or space in the buffer.
+		 */
+		while (irbp < irbufend && icount < ubcount) {
+			/*
+			 * Loop as long as we're unable to read the
+			 * inode btree.
+			 */
+			while (error) {
+				agino += XFS_INODES_PER_CHUNK;
+				if (XFS_AGINO_TO_AGBNO(mp, agino) >=
+						INT_GET(agi->agi_length, ARCH_CONVERT))
+					break;
+				error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
+							    &tmp);
+			}
+			/*
+			 * If ran off the end of the ag either with an error,
+			 * or the normal way, set end and stop collecting.
+			 */
+			if (error ||
+			    (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
+				    &gfree, &i, ARCH_NOCONVERT)) ||
+			    i == 0) {
+				end_of_ag = 1;
+				break;
+			}
+			/*
+			 * If this chunk has any allocated inodes, save it.
+			 */
+			if (gcnt < XFS_INODES_PER_CHUNK) {
+				INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
+				INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
+				INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
+				irbp++;
+				icount += XFS_INODES_PER_CHUNK - gcnt;
+			}
+			/*
+			 * Set agino to after this chunk and bump the cursor.
+			 */
+			agino = gino + XFS_INODES_PER_CHUNK;
+			error = xfs_inobt_increment(cur, 0, &tmp);
+		}
+		/*
+		 * Drop the btree buffers and the agi buffer.
+		 * We can't hold any of the locks these represent
+		 * when calling iget.
+		 */
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+		xfs_trans_brelse(tp, agbp);
+		/*
+		 * Now format all the good inodes into the user's buffer.
+		 */
+		irbufend = irbp;
+		for (irbp = irbuf; irbp < irbufend && ubleft > 0; irbp++) {
+			/*
+			 * Read-ahead the next chunk's worth of inodes.
+			 */
+			if (&irbp[1] < irbufend) {
+				/*
+				 * Loop over all clusters in the next chunk.
+				 * Do a readahead if there are any allocated
+				 * inodes in that cluster.
+				 */
+				for (agbno = XFS_AGINO_TO_AGBNO(mp,
+							INT_GET(irbp[1].ir_startino, ARCH_CONVERT)),
+				     chunkidx = 0;
+				     chunkidx < XFS_INODES_PER_CHUNK;
+				     chunkidx += nicluster,
+				     agbno += nbcluster) {
+					if (XFS_INOBT_MASKN(chunkidx,
+							    nicluster) &
+					    ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT)))
+						xfs_btree_reada_bufs(mp, agno,
+							agbno, nbcluster);
+				}
+			}
+			/*
+			 * Now process this chunk of inodes.
+			 */
+			for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0;
+			     ubleft > 0 &&
+				INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK;
+			     chunkidx++, clustidx++, agino++) {
+				ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
+				/*
+				 * Recompute agbno if this is the
+				 * first inode of the cluster.
+				 *
+				 * Careful with clustidx.   There can be
+				 * multple clusters per chunk, a single
+				 * cluster per chunk or a cluster that has
+				 * inodes represented from several different
+				 * chunks (if blocksize is large).
+				 *
+				 * Because of this, the starting clustidx is
+				 * initialized to zero in this loop but must
+				 * later be reset after reading in the cluster
+				 * buffer.
+				 */
+				if ((chunkidx & (nicluster - 1)) == 0) {
+					agbno = XFS_AGINO_TO_AGBNO(mp,
+							INT_GET(irbp->ir_startino, ARCH_CONVERT)) +
+						((chunkidx & nimask) >>
+						 mp->m_sb.sb_inopblog);
+
+					if (flags & BULKSTAT_FG_QUICK) {
+						ino = XFS_AGINO_TO_INO(mp, agno,
+								       agino);
+						bno = XFS_AGB_TO_DADDR(mp, agno,
+								       agbno);
+
+						/*
+						 * Get the inode cluster buffer
+						 */
+						ASSERT(xfs_inode_zone != NULL);
+						ip = kmem_zone_zalloc(xfs_inode_zone,
+								      KM_SLEEP);
+						ip->i_ino = ino;
+						ip->i_mount = mp;
+						if (bp)
+							xfs_trans_brelse(tp, bp);
+						error = xfs_itobp(mp, tp, ip,
+								  &dip, &bp, bno);
+						if (!error)
+							clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
+						kmem_zone_free(xfs_inode_zone, ip);
+						if (XFS_TEST_ERROR(error != 0,
+								   mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
+								   XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
+							bp = NULL;
+							break;
+						}
+					}
+				}
+				/*
+				 * Skip if this inode is free.
+				 */
+				if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT))
+					continue;
+				/*
+				 * Count used inodes as free so we can tell
+				 * when the chunk is used up.
+				 */
+				INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1);
+				ino = XFS_AGINO_TO_INO(mp, agno, agino);
+				bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+				if (flags & BULKSTAT_FG_QUICK) {
+					dip = (xfs_dinode_t *)xfs_buf_offset(bp,
+					      (clustidx << mp->m_sb.sb_inodelog));
+
+					if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT)
+						    != XFS_DINODE_MAGIC
+					    || !XFS_DINODE_GOOD_VERSION(
+						    INT_GET(dip->di_core.di_version, ARCH_CONVERT)))
+						continue;
+				}
+
+				/*
+				 * Get the inode and fill in a single buffer.
+				 * BULKSTAT_FG_QUICK uses dip to fill it in.
+				 * BULKSTAT_FG_IGET uses igets.
+				 * See: xfs_bulkstat_one & dm_bulkstat_one.
+				 * This is also used to count inodes/blks, etc
+				 * in xfs_qm_quotacheck.
+				 */
+				error = formatter(mp, tp, ino, ubufp, bno, dip,
+					&fmterror);
+				if (fmterror == BULKSTAT_RV_NOTHING)
+					continue;
+				if (fmterror == BULKSTAT_RV_GIVEUP) {
+					ubleft = 0;
+					ASSERT(error);
+					rval = error;
+					break;
+				}
+				if (ubufp)
+					ubufp += statstruct_size;
+				ubleft--;
+				lastino = ino;
+			}
+		}
+
+		if (bp)
+			xfs_trans_brelse(tp, bp);
+
+		/*
+		 * Set up for the next loop iteration.
+		 */
+		if (ubleft > 0) {
+			if (end_of_ag) {
+				agno++;
+				agino = 0;
+			} else
+				agino = XFS_INO_TO_AGINO(mp, lastino);
+		} else
+			break;
+	}
+	/*
+	 * Done, we're either out of filesystem or space to put the data.
+	 */
+	kmem_free(irbuf, NBPC);
+#if defined(HAVE_USERACC)
+	if (ubuffer)
+		unuseracc(ubuffer, ubcount * statstruct_size, (B_READ|B_PHYS));
+#endif
+	*ubcountp = ubcount - ubleft;
+	if (agno >= mp->m_sb.sb_agcount) {
+		/*
+		 * If we ran out of filesystem, mark lastino as off
+		 * the end of the filesystem, so the next call
+		 * will return immediately.
+		 */
+		*lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
+		*done = 1;
+	} else
+		*lastinop = (xfs_ino_t)lastino;
+
+	return rval;
+}
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ * Special case for non-sequential one inode bulkstat.
+ */
+int					/* error status */
+xfs_bulkstat_single(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_ino_t		*lastinop, /* inode to return */
+	xfs_caddr_t		buffer, /* buffer with inode stats */
+	int			*done)	/* 1 if there're more stats to get */
+{
+	xfs_bstat_t		bstat;	/* one bulkstat result structure */
+	int			count;	/* count value for bulkstat call */
+	int			error;	/* return value */
+	xfs_ino_t		ino;	/* filesystem inode number */
+	int			res;	/* result from bs1 */
+
+	/*
+	 * note that requesting valid inode numbers which are not allocated
+	 * to inodes will most likely cause xfs_itobp to generate warning
+	 * messages about bad magic numbers. This is ok. The fact that
+	 * the inode isn't actually an inode is handled by the
+	 * error check below. Done this way to make the usual case faster
+	 * at the expense of the error case.
+	 */
+
+	ino = (xfs_ino_t)*lastinop;
+	error = xfs_bulkstat_one(mp, NULL, ino, &bstat, 0, 0, &res);
+	if (error) {
+		/*
+		 * Special case way failed, do it the "long" way
+		 * to see if that works.
+		 */
+		(*lastinop)--;
+		count = 1;
+		if (xfs_bulkstat(mp, NULL, lastinop, &count, xfs_bulkstat_one,
+				sizeof(bstat), buffer, BULKSTAT_FG_IGET, done))
+			return error;
+		if (count == 0 || (xfs_ino_t)*lastinop != ino)
+			return error == EFSCORRUPTED ?
+				XFS_ERROR(EINVAL) : error;
+		else
+			return 0;
+	}
+	*done = 0;
+	if (copy_to_user(buffer, &bstat, sizeof(bstat)))
+		return XFS_ERROR(EFAULT);
+	return 0;
+}
+
+/*
+ * Return inode number table for the filesystem.
+ */
+int					/* error status */
+xfs_inumbers(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	*lastino,	/* last inode returned */
+	int		*count,		/* size of buffer/count returned */
+	xfs_caddr_t	ubuffer)	/* buffer with inode descriptions */
+{
+	xfs_buf_t	*agbp;
+	xfs_agino_t	agino;
+	xfs_agnumber_t	agno;
+	int		bcount;
+	xfs_inogrp_t	*buffer;
+	int		bufidx;
+	xfs_btree_cur_t *cur;
+	int		error;
+	__int32_t	gcnt;
+	xfs_inofree_t	gfree;
+	xfs_agino_t	gino;
+	int		i;
+	xfs_ino_t	ino;
+	int		left;
+	int		tmp;
+
+	ino = (xfs_ino_t)*lastino;
+	agno = XFS_INO_TO_AGNO(mp, ino);
+	agino = XFS_INO_TO_AGINO(mp, ino);
+	left = *count;
+	*count = 0;
+	bcount = MIN(left, (int)(NBPP / sizeof(*buffer)));
+	buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
+	error = bufidx = 0;
+	cur = NULL;
+	agbp = NULL;
+	while (left > 0 && agno < mp->m_sb.sb_agcount) {
+		if (agbp == NULL) {
+			down_read(&mp->m_peraglock);
+			error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+			up_read(&mp->m_peraglock);
+			if (error) {
+				/*
+				 * If we can't read the AGI of this ag,
+				 * then just skip to the next one.
+				 */
+				ASSERT(cur == NULL);
+				agbp = NULL;
+				agno++;
+				agino = 0;
+				continue;
+			}
+			cur = xfs_btree_init_cursor(mp, tp, agbp, agno,
+				XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+			error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+			if (error) {
+				xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+				cur = NULL;
+				xfs_trans_brelse(tp, agbp);
+				agbp = NULL;
+				/*
+				 * Move up the the last inode in the current
+				 * chunk.  The lookup_ge will always get
+				 * us the first inode in the next chunk.
+				 */
+				agino += XFS_INODES_PER_CHUNK - 1;
+				continue;
+			}
+		}
+		if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
+			&i, ARCH_NOCONVERT)) ||
+		    i == 0) {
+			xfs_trans_brelse(tp, agbp);
+			agbp = NULL;
+			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+			cur = NULL;
+			agno++;
+			agino = 0;
+			continue;
+		}
+		agino = gino + XFS_INODES_PER_CHUNK - 1;
+		buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
+		buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
+		buffer[bufidx].xi_allocmask = ~gfree;
+		bufidx++;
+		left--;
+		if (bufidx == bcount) {
+			if (copy_to_user(ubuffer, buffer,
+					bufidx * sizeof(*buffer))) {
+				error = XFS_ERROR(EFAULT);
+				break;
+			}
+			ubuffer += bufidx * sizeof(*buffer);
+			*count += bufidx;
+			bufidx = 0;
+		}
+		if (left) {
+			error = xfs_inobt_increment(cur, 0, &tmp);
+			if (error) {
+				xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+				cur = NULL;
+				xfs_trans_brelse(tp, agbp);
+				agbp = NULL;
+				/*
+				 * The agino value has already been bumped.
+				 * Just try to skip up to it.
+				 */
+				agino += XFS_INODES_PER_CHUNK;
+				continue;
+			}
+		}
+	}
+	if (!error) {
+		if (bufidx) {
+			if (copy_to_user(ubuffer, buffer,
+					bufidx * sizeof(*buffer)))
+				error = XFS_ERROR(EFAULT);
+			else
+				*count += bufidx;
+		}
+		*lastino = XFS_AGINO_TO_INO(mp, agno, agino);
+	}
+	kmem_free(buffer, bcount * sizeof(*buffer));
+	if (cur)
+		xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
+					   XFS_BTREE_NOERROR));
+	if (agbp)
+		xfs_trans_brelse(tp, agbp);
+	return error;
+}
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
new file mode 100644
index 000000000000..9e68317ecb7f
--- /dev/null
+++ b/fs/xfs/xfs_itable.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_ITABLE_H__
+#define __XFS_ITABLE_H__
+
+/*
+ * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat
+ * structures (by the dmi library). This is a pointer to a formatter function
+ * that will iget the inode and fill in the appropriate structure.
+ * see xfs_bulkstat_one() and dm_bulkstat_one() in dmi_xfs.c
+ */
+typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
+			       struct xfs_trans *tp,
+			       xfs_ino_t	ino,
+			       void		*buffer,
+			       xfs_daddr_t	bno,
+			       void		*dip,
+			       int		*stat);
+/*
+ * Values for stat return value.
+ */
+#define BULKSTAT_RV_NOTHING	0
+#define BULKSTAT_RV_DIDONE	1
+#define BULKSTAT_RV_GIVEUP	2
+
+/*
+ * Values for bulkstat flag argument.
+ */
+#define BULKSTAT_FG_IGET	0x1	/* Go through the buffer cache */
+#define BULKSTAT_FG_QUICK	0x2	/* No iget, walk the dinode cluster */
+#define BULKSTAT_FG_VFSLOCKED	0x4	/* Already have vfs lock */
+
+/*
+ * Return stat information in bulk (by-inode) for the filesystem.
+ */
+int					/* error status */
+xfs_bulkstat(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	*lastino,	/* last inode returned */
+	int		*count,		/* size of buffer/count returned */
+	bulkstat_one_pf formatter,	/* func that'd fill a single buf */
+	size_t		statstruct_size,/* sizeof struct that we're filling */
+	xfs_caddr_t	ubuffer,	/* buffer with inode stats */
+	int		flags,		/* flag to control access method */
+	int		*done);		/* 1 if there're more stats to get */
+
+int
+xfs_bulkstat_single(
+	xfs_mount_t		*mp,
+	xfs_ino_t		*lastinop,
+	xfs_caddr_t		buffer,
+	int			*done);
+
+int
+xfs_bulkstat_one(
+	xfs_mount_t		*mp,
+	xfs_trans_t		*tp,
+	xfs_ino_t		ino,
+	void			*buffer,
+	xfs_daddr_t		bno,
+	void			*dibuff,
+	int			*stat);
+
+int					/* error status */
+xfs_inumbers(
+	xfs_mount_t		*mp,	/* mount point for filesystem */
+	xfs_trans_t		*tp,	/* transaction pointer */
+	xfs_ino_t		*last,	/* last inode returned */
+	int			*count, /* size of buffer/count returned */
+	xfs_caddr_t		buffer);/* buffer with inode descriptions */
+
+#endif	/* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
new file mode 100644
index 000000000000..ac4ced6985a7
--- /dev/null
+++ b/fs/xfs/xfs_log.c
@@ -0,0 +1,3628 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * High level interface routines for log manager
+ */
+
+#include <xfs.h>
+
+
+#define xlog_write_adv_cnt(ptr, len, off, bytes) \
+	{ (ptr) += (bytes); \
+	  (len) -= (bytes); \
+	  (off) += (bytes);}
+
+/* Local miscellaneous function prototypes */
+STATIC int	 xlog_bdstrat_cb(struct xfs_buf *);
+STATIC int	 xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket,
+				    xfs_lsn_t *);
+STATIC xlog_t *	 xlog_alloc_log(xfs_mount_t	*mp,
+				dev_t		log_dev,
+				xfs_daddr_t	blk_offset,
+				int		num_bblks);
+STATIC int	 xlog_space_left(xlog_t *log, int cycle, int bytes);
+STATIC int	 xlog_sync(xlog_t *log, xlog_in_core_t *iclog, uint flags);
+STATIC void	 xlog_unalloc_log(xlog_t *log);
+STATIC int	 xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
+			    int nentries, xfs_log_ticket_t tic,
+			    xfs_lsn_t *start_lsn, uint flags);
+
+/* local state machine functions */
+STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
+STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog);
+static inline void xlog_state_finish_copy(xlog_t	*log,
+					xlog_in_core_t	*iclog,
+					int		first_write,
+					int		bytes);
+STATIC int  xlog_state_get_iclog_space(xlog_t		*log,
+				       int		len,
+				       xlog_in_core_t	**iclog,
+				       xlog_ticket_t	*ticket,
+				       int		*continued_write,
+				       int		*logoffsetp);
+STATIC int  xlog_state_lsn_is_synced(xlog_t		*log,
+				     xfs_lsn_t		lsn,
+				     xfs_log_callback_t *cb,
+				     int		*abortflg);
+STATIC void xlog_state_put_ticket(xlog_t	*log,
+				  xlog_ticket_t *tic);
+STATIC int  xlog_state_release_iclog(xlog_t		*log,
+				     xlog_in_core_t	*iclog);
+STATIC void xlog_state_switch_iclogs(xlog_t		*log,
+				     xlog_in_core_t *iclog,
+				     int		eventual_size);
+STATIC int  xlog_state_sync(xlog_t *log, xfs_lsn_t lsn, uint flags);
+STATIC int  xlog_state_sync_all(xlog_t *log, uint flags);
+STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog);
+
+/* local functions to manipulate grant head */
+STATIC int  xlog_grant_log_space(xlog_t		*log,
+				 xlog_ticket_t	*xtic);
+STATIC void xlog_grant_push_ail(xfs_mount_t	*mp,
+				int		need_bytes);
+STATIC void xlog_regrant_reserve_log_space(xlog_t	 *log,
+					   xlog_ticket_t *ticket);
+STATIC int xlog_regrant_write_log_space(xlog_t		*log,
+					 xlog_ticket_t	*ticket);
+STATIC void xlog_ungrant_log_space(xlog_t	 *log,
+				   xlog_ticket_t *ticket);
+
+
+/* local ticket functions */
+STATIC void		xlog_state_ticket_alloc(xlog_t *log);
+STATIC xlog_ticket_t	*xlog_ticket_get(xlog_t *log,
+					 int	unit_bytes,
+					 int	count,
+					 char	clientid,
+					 uint	flags);
+STATIC void		xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
+
+/* local debug functions */
+#if defined(DEBUG) && !defined(XLOG_NOLOG)
+STATIC void	xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
+#ifdef XFSDEBUG
+STATIC void	xlog_verify_disk_cycle_no(xlog_t *log, xlog_in_core_t *iclog);
+#endif
+STATIC void	xlog_verify_grant_head(xlog_t *log, int equals);
+STATIC void	xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog,
+				  int count, boolean_t syncing);
+STATIC void	xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
+				     xfs_lsn_t tail_lsn);
+#else
+#define xlog_verify_dest_ptr(a,b)
+#define xlog_verify_disk_cycle_no(a,b)
+#define xlog_verify_grant_head(a,b)
+#define xlog_verify_iclog(a,b,c,d)
+#define xlog_verify_tail_lsn(a,b,c)
+#endif
+
+int		xlog_iclogs_empty(xlog_t *log);
+
+#ifdef DEBUG
+int xlog_do_error = 0;
+int xlog_req_num  = 0;
+int xlog_error_mod = 33;
+#endif
+
+#define XLOG_FORCED_SHUTDOWN(log)	(log->l_flags & XLOG_IO_ERROR)
+
+/*
+ * 0 => disable log manager
+ * 1 => enable log manager
+ * 2 => enable log manager and log debugging
+ */
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+int   xlog_debug = 1;
+dev_t xlog_devt	 = 0;
+#endif
+
+#if defined(XFS_LOG_TRACE)
+void
+xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
+{
+	if (! log->l_grant_trace)
+		log->l_grant_trace = ktrace_alloc(1024, KM_SLEEP);
+
+	ktrace_enter(log->l_grant_trace,
+		     (void *)tic,
+		     (void *)log->l_reserve_headq,
+		     (void *)log->l_write_headq,
+		     (void *)((unsigned long)log->l_grant_reserve_cycle),
+		     (void *)((unsigned long)log->l_grant_reserve_bytes),
+		     (void *)((unsigned long)log->l_grant_write_cycle),
+		     (void *)((unsigned long)log->l_grant_write_bytes),
+		     (void *)((unsigned long)log->l_curr_cycle),
+		     (void *)((unsigned long)log->l_curr_block),
+		     (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn, ARCH_NOCONVERT)),
+		     (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn, ARCH_NOCONVERT)),
+		     (void *)string,
+		     (void *)((unsigned long)13),
+		     (void *)((unsigned long)14),
+		     (void *)((unsigned long)15),
+		     (void *)((unsigned long)16));
+}
+
+void
+xlog_trace_tic(xlog_t *log, xlog_ticket_t *tic)
+{
+	if (! log->l_trace)
+		log->l_trace = ktrace_alloc(256, KM_SLEEP);
+
+	ktrace_enter(log->l_trace,
+		     (void *)tic,
+		     (void *)((unsigned long)tic->t_curr_res),
+		     (void *)((unsigned long)tic->t_unit_res),
+		     (void *)((unsigned long)tic->t_ocnt),
+		     (void *)((unsigned long)tic->t_cnt),
+		     (void *)((unsigned long)tic->t_flags),
+		     (void *)((unsigned long)7),
+		     (void *)((unsigned long)8),
+		     (void *)((unsigned long)9),
+		     (void *)((unsigned long)10),
+		     (void *)((unsigned long)11),
+		     (void *)((unsigned long)12),
+		     (void *)((unsigned long)13),
+		     (void *)((unsigned long)14),
+		     (void *)((unsigned long)15),
+		     (void *)((unsigned long)16));
+}
+
+void
+xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
+{
+	pid_t pid;
+
+	pid = current_pid();
+
+	if (!iclog->ic_trace)
+		iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
+	ktrace_enter(iclog->ic_trace,
+		     (void *)((unsigned long)state),
+		     (void *)((unsigned long)pid),
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0,
+		     (void *)0);
+}
+
+#else
+#define xlog_trace_loggrant(log,tic,string)
+#define xlog_trace_iclog(iclog,state)
+#endif /* XFS_LOG_TRACE */
+
+/*
+ * NOTES:
+ *
+ *	1. currblock field gets updated at startup and after in-core logs
+ *		marked as with WANT_SYNC.
+ */
+
+/*
+ * This routine is called when a user of a log manager ticket is done with
+ * the reservation.  If the ticket was ever used, then a commit record for
+ * the associated transaction is written out as a log operation header with
+ * no data.  The flag XLOG_TIC_INITED is set when the first write occurs with
+ * a given ticket.  If the ticket was one with a permanent reservation, then
+ * a few operations are done differently.  Permanent reservation tickets by
+ * default don't release the reservation.  They just commit the current
+ * transaction with the belief that the reservation is still needed.  A flag
+ * must be passed in before permanent reservations are actually released.
+ * When these type of tickets are not released, they need to be set into
+ * the inited state again.  By doing this, a start record will be written
+ * out when the next write occurs.
+ */
+xfs_lsn_t
+xfs_log_done(xfs_mount_t	*mp,
+	     xfs_log_ticket_t	xtic,
+	     uint		flags)
+{
+	xlog_t		*log	= mp->m_log;
+	xlog_ticket_t	*ticket = (xfs_log_ticket_t) xtic;
+	xfs_lsn_t	lsn	= 0;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return 0;
+#endif
+
+	if (XLOG_FORCED_SHUTDOWN(log) ||
+	    /*
+	     * If nothing was ever written, don't write out commit record.
+	     * If we get an error, just continue and give back the log ticket.
+	     */
+	    (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
+	     (xlog_commit_record(mp, ticket, &lsn)))) {
+		lsn = (xfs_lsn_t) -1;
+		if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
+			flags |= XFS_LOG_REL_PERM_RESERV;
+		}
+	}
+
+
+	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 ||
+	    (flags & XFS_LOG_REL_PERM_RESERV)) {
+		/*
+		 * Release ticket if not permanent reservation or a specifc
+		 * request has been made to release a permanent reservation.
+		 */
+		xlog_ungrant_log_space(log, ticket);
+		xlog_state_put_ticket(log, ticket);
+	} else {
+		xlog_regrant_reserve_log_space(log, ticket);
+	}
+
+	/* If this ticket was a permanent reservation and we aren't
+	 * trying to release it, reset the inited flags; so next time
+	 * we write, a start record will be written out.
+	 */
+	if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
+	    (flags & XFS_LOG_REL_PERM_RESERV) == 0)
+		ticket->t_flags |= XLOG_TIC_INITED;
+
+	return lsn;
+}	/* xfs_log_done */
+
+
+/*
+ * Force the in-core log to disk.  If flags == XFS_LOG_SYNC,
+ *	the force is done synchronously.
+ *
+ * Asynchronous forces are implemented by setting the WANT_SYNC
+ * bit in the appropriate in-core log and then returning.
+ *
+ * Synchronous forces are implemented with a semaphore.	 All callers
+ * to force a given lsn to disk will wait on a semaphore attached to the
+ * specific in-core log.  When given in-core log finally completes its
+ * write to disk, that thread will wake up all threads waiting on the
+ * semaphore.
+ */
+int
+xfs_log_force(xfs_mount_t *mp,
+	      xfs_lsn_t	  lsn,
+	      uint	  flags)
+{
+	int	rval;
+	xlog_t *log = mp->m_log;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return 0;
+#endif
+
+	ASSERT(flags & XFS_LOG_FORCE);
+
+	XFS_STATS_INC(xfsstats.xs_log_force);
+
+	if ((log->l_flags & XLOG_IO_ERROR) == 0) {
+		if (lsn == 0)
+			rval = xlog_state_sync_all(log, flags);
+		else
+			rval = xlog_state_sync(log, lsn, flags);
+	} else {
+		rval = XFS_ERROR(EIO);
+	}
+
+	return rval;
+
+}	/* xfs_log_force */
+
+
+/*
+ * This function will take a log sequence number and check to see if that
+ * lsn has been flushed to disk.  If it has, then the callback function is
+ * called with the callback argument.  If the relevant in-core log has not
+ * been synced to disk, we add the callback to the callback list of the
+ * in-core log.
+ */
+void
+xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
+	       xfs_lsn_t	  lsn,		/* lsn looking for */
+	       xfs_log_callback_t *cb)
+{
+	xlog_t *log = mp->m_log;
+	int	abortflg;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return;
+#endif
+	cb->cb_next = 0;
+	if (xlog_state_lsn_is_synced(log, lsn, cb, &abortflg))
+		cb->cb_func(cb->cb_arg, abortflg);
+}	/* xfs_log_notify */
+
+
+/*
+ * Initialize log manager data.	 This routine is intended to be called when
+ * a system boots up.  It is not a per filesystem initialization.
+ *
+ * As you can see, we currently do nothing.
+ */
+int
+xfs_log_init(void)
+{
+	return( 0 );
+}
+
+
+/*
+ *  1. Reserve an amount of on-disk log space and return a ticket corresponding
+ *	to the reservation.
+ *  2. Potentially, push buffers at tail of log to disk.
+ *
+ * Each reservation is going to reserve extra space for a log record header.
+ * When writes happen to the on-disk log, we don't subtract the length of the
+ * log record header from any reservation.  By wasting space in each
+ * reservation, we prevent over allocation problems.
+ */
+int
+xfs_log_reserve(xfs_mount_t	 *mp,
+		int		 unit_bytes,
+		int		 cnt,
+		xfs_log_ticket_t *ticket,
+		__uint8_t	 client,
+		uint		 flags)
+{
+	xlog_t		*log = mp->m_log;
+	xlog_ticket_t	*internal_ticket;
+	int		retval;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return 0;
+#endif
+	retval = 0;
+	ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
+	ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
+
+	if (XLOG_FORCED_SHUTDOWN(log))
+		return XFS_ERROR(EIO);
+
+	XFS_STATS_INC(xfsstats.xs_try_logspace);
+
+	if (*ticket != NULL) {
+		ASSERT(flags & XFS_LOG_PERM_RESERV);
+		internal_ticket = (xlog_ticket_t *)*ticket;
+		xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
+		retval = xlog_regrant_write_log_space(log, internal_ticket);
+	} else {
+		/* may sleep if need to allocate more tickets */
+		internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
+						  client, flags);
+		*ticket = internal_ticket;
+		xlog_grant_push_ail(mp,
+				    (internal_ticket->t_unit_res *
+				     internal_ticket->t_cnt));
+		retval = xlog_grant_log_space(log, internal_ticket);
+	}
+
+	return retval;
+}	/* xfs_log_reserve */
+
+
+/*
+ * Mount a log filesystem
+ *
+ * mp		- ubiquitous xfs mount point structure
+ * log_dev	- device number of on-disk log device
+ * blk_offset	- Start block # where block size is 512 bytes (BBSIZE)
+ * num_bblocks	- Number of BBSIZE blocks in on-disk log
+ *
+ * Return error or zero.
+ */
+int
+xfs_log_mount(xfs_mount_t	*mp,
+	      dev_t		log_dev,
+	      xfs_daddr_t	blk_offset,
+	      int		num_bblks)
+{
+	xlog_t *log;
+
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+		cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname);
+	else {
+		cmn_err(CE_NOTE,
+			"!Mounting filesystem \"%s\" in no-recovery mode.  Filesystem will be inconsistent.",
+			mp->m_fsname);
+		ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
+	}
+
+	mp->m_log = log = xlog_alloc_log(mp, log_dev, blk_offset, num_bblks);
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug) {
+		cmn_err(CE_NOTE, "log dev: 0x%x", log_dev);
+		return 0;
+	}
+#endif
+	/*
+	 * skip log recovery on a norecovery mount.  pretend it all
+	 * just worked.
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+		int    error;
+
+		error = xlog_recover(log,XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
+		if (error) {
+			cmn_err(CE_WARN, "XFS: log mount/recovery failed");
+			xlog_unalloc_log(log);
+			return error;
+		}
+	}
+
+	/* Normal transactions can now occur */
+	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+
+	/* End mounting message in xfs_log_mount_finish */
+	return 0;
+}	/* xfs_log_mount */
+
+/*
+ * Finish the recovery of the file system.  This is separate from
+ * the xfs_log_mount() call, because it depends on the code in
+ * xfs_mountfs() to read in the root and real-time bitmap inodes
+ * between calling xfs_log_mount() and here.
+ *
+ * mp		- ubiquitous xfs mount point structure
+ */
+int
+xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
+{
+	int	error;
+
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+		error = xlog_recover_finish(mp->m_log, mfsi_flags);
+	else {
+		error = 0;
+		ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY);
+	}
+
+	return error;
+}
+
+/*
+ * Unmount processing for the log.
+ */
+int
+xfs_log_unmount(xfs_mount_t *mp)
+{
+	int		error;
+
+	error = xfs_log_unmount_write(mp);
+	xfs_log_unmount_dealloc(mp);
+	return (error);
+}
+
+/*
+ * Final log writes as part of unmount.
+ *
+ * Mark the filesystem clean as unmount happens.  Note that during relocation
+ * this routine needs to be executed as part of source-bag while the
+ * deallocation must not be done until source-end.
+ */
+
+/*
+ * Unmount record used to have a string "Unmount filesystem--" in the
+ * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
+ * We just write the magic number now since that particular field isn't
+ * currently architecture converted and "nUmount" is a bit foo.
+ * As far as I know, there weren't any dependencies on the old behaviour.
+ */
+
+int
+xfs_log_unmount_write(xfs_mount_t *mp)
+{
+	xlog_t		 *log = mp->m_log;
+	xlog_in_core_t	 *iclog;
+#ifdef DEBUG
+	xlog_in_core_t	 *first_iclog;
+#endif
+	xfs_log_iovec_t	 reg[1];
+	xfs_log_ticket_t tic = 0;
+	xfs_lsn_t	 lsn;
+	int		 error;
+	SPLDECL(s);
+
+	/* the data section must be 32 bit size aligned */
+	struct {
+	    __uint16_t magic;
+	    __uint16_t pad1;
+	    __uint32_t pad2; /* may as well make it 64 bits */
+	} magic = { XLOG_UNMOUNT_TYPE, 0, 0 };
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (! xlog_debug && xlog_devt == log->l_dev)
+		return 0;
+#endif
+
+	/*
+	 * Don't write out unmount record on read-only mounts.
+	 * Or, if we are doing a forced umount (typically because of IO errors).
+	 */
+	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+		return 0;
+
+	xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+
+#ifdef DEBUG
+	first_iclog = iclog = log->l_iclog;
+	do {
+		if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
+			ASSERT(iclog->ic_state & XLOG_STATE_ACTIVE);
+			ASSERT(iclog->ic_offset == 0);
+		}
+		iclog = iclog->ic_next;
+	} while (iclog != first_iclog);
+#endif
+	if (! (XLOG_FORCED_SHUTDOWN(log))) {
+		reg[0].i_addr = (void*)&magic;
+		reg[0].i_len  = sizeof(magic);
+
+		error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+		if (!error) {
+			/* remove inited flag */
+			((xlog_ticket_t *)tic)->t_flags = 0;
+			error = xlog_write(mp, reg, 1, tic, &lsn,
+					   XLOG_UNMOUNT_TRANS);
+			/*
+			 * At this point, we're umounting anyway,
+			 * so there's no point in transitioning log state
+			 * to IOERROR. Just continue...
+			 */
+		}
+
+		if (error) {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"xfs_log_unmount: unmount record failed");
+		}
+
+
+		s = LOG_LOCK(log);
+		iclog = log->l_iclog;
+		iclog->ic_refcnt++;
+		LOG_UNLOCK(log, s);
+		xlog_state_want_sync(log, iclog);
+		(void) xlog_state_release_iclog(log, iclog);
+
+		s = LOG_LOCK(log);
+		if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
+		      iclog->ic_state == XLOG_STATE_DIRTY)) {
+			if (!XLOG_FORCED_SHUTDOWN(log)) {
+				sv_wait(&iclog->ic_forcesema, PMEM,
+					&log->l_icloglock, s);
+			} else {
+				LOG_UNLOCK(log, s);
+			}
+		} else {
+			LOG_UNLOCK(log, s);
+		}
+		if (tic)
+			xlog_state_put_ticket(log, tic);
+	} else {
+		/*
+		 * We're already in forced_shutdown mode, couldn't
+		 * even attempt to write out the unmount transaction.
+		 *
+		 * Go through the motions of sync'ing and releasing
+		 * the iclog, even though no I/O will actually happen,
+		 * we need to wait for other log I/O's that may already
+		 * be in progress.  Do this as a separate section of
+		 * code so we'll know if we ever get stuck here that
+		 * we're in this odd situation of trying to unmount
+		 * a file system that went into forced_shutdown as
+		 * the result of an unmount..
+		 */
+		s = LOG_LOCK(log);
+		iclog = log->l_iclog;
+		iclog->ic_refcnt++;
+		LOG_UNLOCK(log, s);
+
+		xlog_state_want_sync(log, iclog);
+		(void) xlog_state_release_iclog(log, iclog);
+
+		s = LOG_LOCK(log);
+
+		if ( ! (   iclog->ic_state == XLOG_STATE_ACTIVE
+			|| iclog->ic_state == XLOG_STATE_DIRTY
+			|| iclog->ic_state == XLOG_STATE_IOERROR) ) {
+
+				sv_wait(&iclog->ic_forcesema, PMEM,
+					&log->l_icloglock, s);
+		} else {
+			LOG_UNLOCK(log, s);
+		}
+	}
+
+	return 0;
+}	/* xfs_log_unmount_write */
+
+/*
+ * Deallocate log structures for unmount/relocation.
+ */
+void
+xfs_log_unmount_dealloc(xfs_mount_t *mp)
+{
+	xlog_unalloc_log(mp->m_log);
+}
+
+/*
+ * Write region vectors to log.	 The write happens using the space reservation
+ * of the ticket (tic).	 It is not a requirement that all writes for a given
+ * transaction occur with one call to xfs_log_write().
+ */
+int
+xfs_log_write(xfs_mount_t *	mp,
+	      xfs_log_iovec_t	reg[],
+	      int		nentries,
+	      xfs_log_ticket_t	tic,
+	      xfs_lsn_t		*start_lsn)
+{
+	int	error;
+	xlog_t *log = mp->m_log;
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+
+	if (! xlog_debug && xlog_devt == log->l_dev) {
+		*start_lsn = 0;
+		return 0;
+	}
+#endif
+	if (XLOG_FORCED_SHUTDOWN(log))
+		return XFS_ERROR(EIO);
+
+	if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, 0))) {
+		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+	}
+	return (error);
+}	/* xfs_log_write */
+
+
+void
+xfs_log_move_tail(xfs_mount_t	*mp,
+		  xfs_lsn_t	tail_lsn)
+{
+	xlog_ticket_t	*tic;
+	xlog_t		*log = mp->m_log;
+	int		need_bytes, free_bytes, cycle, bytes;
+	SPLDECL(s);
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	if (!xlog_debug && xlog_devt == log->l_dev)
+		return;
+#endif
+	/* XXXsup tmp */
+	if (XLOG_FORCED_SHUTDOWN(log))
+		return;
+	ASSERT(!XFS_FORCED_SHUTDOWN(mp));
+
+	if (tail_lsn == 0) {
+		/* needed since sync_lsn is 64 bits */
+		s = LOG_LOCK(log);
+		tail_lsn = log->l_last_sync_lsn;
+		LOG_UNLOCK(log, s);
+	}
+
+	s = GRANT_LOCK(log);
+
+	/* Also an illegal lsn.	 1 implies that we aren't passing in a legal
+	 * tail_lsn.
+	 */
+	if (tail_lsn != 1)
+		log->l_tail_lsn = tail_lsn;
+
+	if ((tic = log->l_write_headq)) {
+#ifdef DEBUG
+		if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+			panic("Recovery problem");
+#endif
+		cycle = log->l_grant_write_cycle;
+		bytes = log->l_grant_write_bytes;
+		free_bytes = xlog_space_left(log, cycle, bytes);
+		do {
+			ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+
+			if (free_bytes < tic->t_unit_res)
+				break;
+			free_bytes -= tic->t_unit_res;
+			sv_signal(&tic->t_sema);
+			tic = tic->t_next;
+		} while (tic != log->l_write_headq);
+	}
+	if ((tic = log->l_reserve_headq)) {
+#ifdef DEBUG
+		if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+			panic("Recovery problem");
+#endif
+		cycle = log->l_grant_reserve_cycle;
+		bytes = log->l_grant_reserve_bytes;
+		free_bytes = xlog_space_left(log, cycle, bytes);
+		do {
+			if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+				need_bytes = tic->t_unit_res*tic->t_cnt;
+			else
+				need_bytes = tic->t_unit_res;
+			if (free_bytes < need_bytes)
+				break;
+			free_bytes -= need_bytes;
+			sv_signal(&tic->t_sema);
+			tic = tic->t_next;
+		} while (tic != log->l_reserve_headq);
+	}
+	GRANT_UNLOCK(log, s);
+}	/* xfs_log_move_tail */
+
+/*
+ * Determine if we have a transaction that has gone to disk
+ * that needs to be covered. Log activity needs to be idle (no AIL and
+ * nothing in the iclogs). And, we need to be in the right state indicating
+ * something has gone out.
+ */
+int
+xfs_log_need_covered(xfs_mount_t *mp)
+{
+	SPLDECL(s);
+	int		needed = 0, gen;
+	xlog_t		*log = mp->m_log;
+
+	if (mp->m_frozen || XFS_FORCED_SHUTDOWN(mp))
+		return 0;
+
+	s = LOG_LOCK(log);
+	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
+		(log->l_covered_state == XLOG_STATE_COVER_NEED2))
+			&& !xfs_trans_first_ail(mp, &gen)
+			&& xlog_iclogs_empty(log)) {
+		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+			log->l_covered_state = XLOG_STATE_COVER_DONE;
+		else {
+			ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
+			log->l_covered_state = XLOG_STATE_COVER_DONE2;
+		}
+		needed = 1;
+	}
+	LOG_UNLOCK(log, s);
+	return(needed);
+}
+
+/******************************************************************************
+ *
+ *	local routines
+ *
+ ******************************************************************************
+ */
+
+/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
+ * The log manager must keep track of the last LR which was committed
+ * to disk.  The lsn of this LR will become the new tail_lsn whenever
+ * xfs_trans_tail_ail returns 0.  If we don't do this, we run into
+ * the situation where stuff could be written into the log but nothing
+ * was ever in the AIL when asked.  Eventually, we panic since the
+ * tail hits the head.
+ *
+ * We may be holding the log iclog lock upon entering this routine.
+ */
+xfs_lsn_t
+xlog_assign_tail_lsn(xfs_mount_t *mp, xlog_in_core_t *iclog)
+{
+	xfs_lsn_t tail_lsn;
+	SPLDECL(s);
+	xlog_t	  *log = mp->m_log;
+
+	tail_lsn = xfs_trans_tail_ail(mp);
+	s = GRANT_LOCK(log);
+	if (tail_lsn != 0)
+		log->l_tail_lsn = tail_lsn;
+	else
+		tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn;
+	if (iclog)
+		INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, tail_lsn);
+	GRANT_UNLOCK(log, s);
+
+	return tail_lsn;
+}	/* xlog_assign_tail_lsn */
+
+
+/*
+ * Return the space in the log between the tail and the head.  The head
+ * is passed in the cycle/bytes formal parms.  In the special case where
+ * the reserve head has wrapped passed the tail, this calculation is no
+ * longer valid.  In this case, just return 0 which means there is no space
+ * in the log.	This works for all places where this function is called
+ * with the reserve head.  Of course, if the write head were to ever
+ * wrap the tail, we should blow up.  Rather than catch this case here,
+ * we depend on other ASSERTions in other parts of the code.   XXXmiken
+ *
+ * This code also handles the case where the reservation head is behind
+ * the tail.  The details of this case are described below, but the end
+ * result is that we return the size of the log as the amount of space left.
+ */
+int
+xlog_space_left(xlog_t *log, int cycle, int bytes)
+{
+	int free_bytes;
+	int tail_bytes;
+	int tail_cycle;
+
+	tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn, ARCH_NOCONVERT));
+	tail_cycle = CYCLE_LSN(log->l_tail_lsn, ARCH_NOCONVERT);
+	if ((tail_cycle == cycle) && (bytes >= tail_bytes)) {
+		free_bytes = log->l_logsize - (bytes - tail_bytes);
+	} else if ((tail_cycle + 1) < cycle) {
+		return 0;
+	} else if (tail_cycle < cycle) {
+		ASSERT(tail_cycle == (cycle - 1));
+		free_bytes = tail_bytes - bytes;
+	} else {
+		/*
+		 * The reservation head is behind the tail.
+		 * This can only happen when the AIL is empty so the tail
+		 * is equal to the head and the l_roundoff value in the
+		 * log structure is taking up the difference between the
+		 * reservation head and the tail.  The bytes accounted for
+		 * by the l_roundoff field are temporarily 'lost' to the
+		 * reservation mechanism, but they are cleaned up when the
+		 * log buffers that created them are reused.  These lost
+		 * bytes are what allow the reservation head to fall behind
+		 * the tail in the case that the log is 'empty'.
+		 * In this case we just want to return the size of the
+		 * log as the amount of space left.
+		 */
+/* This assert does not take into account padding from striped log writes *
+		ASSERT((tail_cycle == (cycle + 1)) ||
+		       ((bytes + log->l_roundoff) >= tail_bytes));
+*/
+		free_bytes = log->l_logsize;
+	}
+	return free_bytes;
+}	/* xlog_space_left */
+
+
+/*
+ * Log function which is called when an io completes.
+ *
+ * The log manager needs its own routine, in order to control what
+ * happens with the buffer after the write completes.
+ */
+void
+xlog_iodone(xfs_buf_t *bp)
+{
+	xlog_in_core_t *iclog;
+	int		aborted;
+
+	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
+	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+	aborted = 0;
+
+	/*
+	 * Race to shutdown the filesystem if we see an error.
+	 */
+	if (XFS_BUF_GETERROR(bp)) {
+		/* Some versions of cpp barf on the recursive definition of
+		 * ic_log -> hic_fields.ic_log and expand ic_log twice when
+		 * it is passed through two macros.  Workaround for broken cpp
+		 */
+		struct log *l;
+		xfs_ioerror_alert("xlog_iodone",
+				  iclog->ic_log->l_mp, bp, XFS_BUF_ADDR(bp));
+		XFS_BUF_STALE(bp);
+		l = iclog->ic_log;
+		xfs_force_shutdown(l->l_mp, XFS_LOG_IO_ERROR);
+		/*
+		 * This flag will be propagated to the trans-committed
+		 * callback routines to let them know that the log-commit
+		 * didn't succeed.
+		 */
+		aborted = XFS_LI_ABORTED;
+	} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		aborted = XFS_LI_ABORTED;
+	}
+	xlog_state_done_syncing(iclog, aborted);
+	if ( !(XFS_BUF_ISASYNC(bp)) ) {
+		/*
+		 * Corresponding psema() will be done in bwrite().  If we don't
+		 * vsema() here, panic.
+		 */
+	  XFS_BUF_V_IODONESEMA(bp);
+	}
+}	/* xlog_iodone */
+
+/*
+ * The bdstrat callback function for log bufs. This gives us a central
+ * place to trap bufs in case we get hit by a log I/O error and need to
+ * shutdown. Actually, in practice, even when we didn't get a log error,
+ * we transition the iclogs to IOERROR state *after* flushing all existing
+ * iclogs to disk. This is because we don't want anymore new transactions to be
+ * started or completed afterwards.
+ */
+STATIC int
+xlog_bdstrat_cb(struct xfs_buf *bp)
+{
+	xlog_in_core_t *iclog;
+
+	iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *);
+
+	if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) {
+	  /* note for irix bstrat will need  struct bdevsw passed
+	   * Fix the following macro if the code ever is merged
+	   */
+	    XFS_bdstrat(bp);
+		return 0;
+	}
+
+	xfs_buftrace("XLOG__BDSTRAT IOERROR", bp);
+	XFS_BUF_ERROR(bp, EIO);
+	XFS_BUF_STALE(bp);
+	xfs_biodone(bp);
+	return (XFS_ERROR(EIO));
+
+
+}
+
+/*
+ * Return size of each in-core log record buffer.
+ *
+ * Low memory machines only get 2 16KB buffers.	 We don't want to waste
+ * memory here.	 However, all other machines get at least 2 32KB buffers.
+ * The number is hard coded because we don't care about the minimum
+ * memory size, just 32MB systems.
+ *
+ * If the filesystem blocksize is too large, we may need to choose a
+ * larger size since the directory code currently logs entire blocks.
+ * XXXmiken XXXcurtis
+ */
+
+STATIC void
+xlog_get_iclog_buffer_size(xfs_mount_t	*mp,
+			   xlog_t	*log)
+{
+	int size;
+	int xhdrs;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+	/*
+	 * When logbufs == 0, someone has disabled the log from the FSTAB
+	 * file.  This is not a documented feature.  We need to set xlog_debug
+	 * to zero (this deactivates the log) and set xlog_devt to the
+	 * appropriate dev_t.  Only one filesystem may be affected as such
+	 * since this is just a performance hack to test what we might be able
+	 * to get if the log were not present.
+	 */
+	if (mp->m_logbufs == 0) {
+		xlog_debug = 0;
+		xlog_devt = log->l_dev;
+		log->l_iclog_bufs = XLOG_NUM_ICLOGS;
+	} else
+#endif
+	{
+		/*
+		 * This is the normal path.  If m_logbufs == -1, then the
+		 * admin has chosen to use the system defaults for logbuffers.
+		 */
+		if (mp->m_logbufs == -1)
+			log->l_iclog_bufs = XLOG_NUM_ICLOGS;
+		else
+			log->l_iclog_bufs = mp->m_logbufs;
+
+#if defined(DEBUG) || defined(XLOG_NOLOG)
+		/* We are reactivating a filesystem after it was active */
+		if (log->l_dev == xlog_devt) {
+			xlog_devt = 1;
+			xlog_debug = 1;
+		}
+#endif
+	}
+
+	/*
+	 * Buffer size passed in from mount system call.
+	 */
+	if (mp->m_logbsize != -1) {
+		size = log->l_iclog_size = mp->m_logbsize;
+		log->l_iclog_size_log = 0;
+		while (size != 1) {
+			log->l_iclog_size_log++;
+			size >>= 1;
+		}
+
+		if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
+			/* # headers = size / 32K
+			 * one header holds cycles from 32K of data
+			 */
+
+			xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
+			if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
+				xhdrs++;
+			log->l_iclog_hsize = xhdrs << BBSHIFT;
+			log->l_iclog_heads = xhdrs;
+		} else {
+			ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
+			log->l_iclog_hsize = BBSIZE;
+			log->l_iclog_heads = 1;
+		}
+		return;
+	}
+
+	/*
+	 * Special case machines that have less than 32MB of memory.
+	 * All machines with more memory use 32KB buffers.
+	 */
+	if (xfs_physmem <= btoc(32*1024*1024)) {
+		/* Don't change; min configuration */
+		log->l_iclog_size = XLOG_RECORD_BSIZE;		/* 16k */
+		log->l_iclog_size_log = XLOG_RECORD_BSHIFT;
+	} else {
+		log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;	/* 32k */
+		log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
+	}
+
+	/* the default log size is 16k or 32k which is one header sector */
+	log->l_iclog_hsize = BBSIZE;
+	log->l_iclog_heads = 1;
+
+	/*
+	 * For 16KB, we use 3 32KB buffers.  For 32KB block sizes, we use
+	 * 4 32KB buffers.  For 64KB block sizes, we use 8 32KB buffers.
+	 */
+	if (mp->m_sb.sb_blocksize >= 16*1024) {
+		log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
+		log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
+		if (mp->m_logbufs == -1) {
+			switch (mp->m_sb.sb_blocksize) {
+			    case 16*1024:			/* 16 KB */
+				log->l_iclog_bufs = 3;
+				break;
+			    case 32*1024:			/* 32 KB */
+				log->l_iclog_bufs = 4;
+				break;
+			    case 64*1024:			/* 64 KB */
+				log->l_iclog_bufs = 8;
+				break;
+			    default:
+				xlog_panic("XFS: Illegal blocksize");
+				break;
+			}
+		}
+	}
+}	/* xlog_get_iclog_buffer_size */
+
+
+/*
+ * This routine initializes some of the log structure for a given mount point.
+ * Its primary purpose is to fill in enough, so recovery can occur.  However,
+ * some other stuff may be filled in too.
+ */
+STATIC xlog_t *
+xlog_alloc_log(xfs_mount_t	*mp,
+	       dev_t		log_dev,
+	       xfs_daddr_t	blk_offset,
+	       int		num_bblks)
+{
+	xlog_t			*log;
+	xlog_rec_header_t	*head;
+	xlog_in_core_t		**iclogp;
+	xlog_in_core_t		*iclog, *prev_iclog=NULL;
+	xfs_buf_t		*bp;
+	int			i;
+	int			iclogsize;
+
+	log = (void *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP);
+
+	log->l_mp	   = mp;
+	log->l_dev	   = log_dev;
+	log->l_logsize	   = BBTOB(num_bblks);
+	log->l_logBBstart  = blk_offset;
+	log->l_logBBsize   = num_bblks;
+	log->l_roundoff	   = 0;
+	log->l_covered_state = XLOG_STATE_COVER_IDLE;
+	log->l_flags	   |= XLOG_ACTIVE_RECOVERY;
+
+	log->l_prev_block  = -1;
+	ASSIGN_ANY_LSN(log->l_tail_lsn, 1, 0, ARCH_NOCONVERT);
+	/* log->l_tail_lsn    = 0x100000000LL; cycle = 1; current block = 0 */
+	log->l_last_sync_lsn = log->l_tail_lsn;
+	log->l_curr_cycle  = 1;	    /* 0 is bad since this is initial value */
+	log->l_curr_block  = 0;		/* filled in by xlog_recover */
+	log->l_grant_reserve_bytes = 0;
+	log->l_grant_reserve_cycle = 1;
+	log->l_grant_write_bytes = 0;
+	log->l_grant_write_cycle = 1;
+	log->l_quotaoffs_flag = 0;	/* XFS_LI_QUOTAOFF logitems */
+
+	xlog_get_iclog_buffer_size(mp, log);
+
+	bp = log->l_xbuf   = XFS_getrbuf(0,mp); /* get my locked buffer */ /* mp needed for pagebuf/linux only */
+
+	XFS_BUF_SET_TARGET(bp, mp->m_logdev_targp);
+	XFS_BUF_SET_SIZE(bp, log->l_iclog_size);
+	XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
+	XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
+	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+	ASSERT(XFS_BUF_ISBUSY(log->l_xbuf));
+	ASSERT(XFS_BUF_VALUSEMA(log->l_xbuf) <= 0);
+	spinlock_init(&log->l_icloglock, "iclog");
+	spinlock_init(&log->l_grant_lock, "grhead_iclog");
+	initnsema(&log->l_flushsema, 0, "ic-flush");
+	xlog_state_ticket_alloc(log);  /* wait until after icloglock inited */
+
+	/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
+	ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
+
+	iclogp = &log->l_iclog;
+	/*
+	 * The amount of memory to allocate for the iclog structure is
+	 * rather funky due to the way the structure is defined.  It is
+	 * done this way so that we can use different sizes for machines
+	 * with different amounts of memory.  See the definition of
+	 * xlog_in_core_t in xfs_log_priv.h for details.
+	 */
+	iclogsize = log->l_iclog_size;
+	ASSERT(log->l_iclog_size >= 4096);
+	for (i=0; i < log->l_iclog_bufs; i++) {
+		*iclogp = (xlog_in_core_t *)
+			  kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
+		iclog = *iclogp;
+		iclog->hic_data = (xlog_in_core_2_t *)
+			  kmem_alloc(iclogsize, KM_SLEEP);
+
+		iclog->ic_prev = prev_iclog;
+		prev_iclog = iclog;
+		log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
+
+		head = &iclog->ic_header;
+		memset(head, 0, sizeof(xlog_rec_header_t));
+		INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
+		INT_SET(head->h_version, ARCH_CONVERT,
+			XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
+		INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size);
+		/* new fields */
+		INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT);
+		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+
+		bp = iclog->ic_bp = XFS_getrbuf(0,mp);		/* my locked buffer */ /* mp need for pagebuf/linux only */
+		XFS_BUF_SET_TARGET(bp, mp->m_logdev_targp);
+		XFS_BUF_SET_SIZE(bp, log->l_iclog_size);
+		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
+		XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
+		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+
+		iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
+		iclog->ic_state = XLOG_STATE_ACTIVE;
+		iclog->ic_log = log;
+		iclog->ic_callback_tail = &(iclog->ic_callback);
+		iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
+
+		ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
+		ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
+		sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
+
+		iclogp = &iclog->ic_next;
+	}
+	*iclogp = log->l_iclog;			/* complete ring */
+	log->l_iclog->ic_prev = prev_iclog;	/* re-write 1st prev ptr */
+
+	return log;
+}	/* xlog_alloc_log */
+
+
+/*
+ * Write out the commit record of a transaction associated with the given
+ * ticket.  Return the lsn of the commit record.
+ */
+STATIC int
+xlog_commit_record(xfs_mount_t	*mp,
+		   xlog_ticket_t *ticket,
+		   xfs_lsn_t	*commitlsnp)
+{
+	int		error;
+	xfs_log_iovec_t reg[1];
+
+	reg[0].i_addr = 0;
+	reg[0].i_len = 0;
+
+	if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
+			       XLOG_COMMIT_TRANS))) {
+		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+	}
+	return (error);
+}	/* xlog_commit_record */
+
+
+/*
+ * Push on the buffer cache code if we ever use more than 75% of the on-disk
+ * log space.  This code pushes on the lsn which would supposedly free up
+ * the 25% which we want to leave free.	 We may need to adopt a policy which
+ * pushes on an lsn which is further along in the log once we reach the high
+ * water mark.	In this manner, we would be creating a low water mark.
+ */
+void
+xlog_grant_push_ail(xfs_mount_t *mp,
+		    int		need_bytes)
+{
+    xlog_t	*log = mp->m_log;	/* pointer to the log */
+    xfs_lsn_t	tail_lsn;		/* lsn of the log tail */
+    xfs_lsn_t	threshold_lsn = 0;	/* lsn we'd like to be at */
+    int		free_blocks;		/* free blocks left to write to */
+    int		free_bytes;		/* free bytes left to write to */
+    int		threshold_block;	/* block in lsn we'd like to be at */
+    int		threshold_cycle;	/* lsn cycle we'd like to be at */
+    int		free_threshold;
+    SPLDECL(s);
+
+    ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
+
+    s = GRANT_LOCK(log);
+    free_bytes = xlog_space_left(log,
+				 log->l_grant_reserve_cycle,
+				 log->l_grant_reserve_bytes);
+    tail_lsn = log->l_tail_lsn;
+    free_blocks = BTOBBT(free_bytes);
+
+    /*
+     * Set the threshold for the minimum number of free blocks in the
+     * log to the maximum of what the caller needs, one quarter of the
+     * log, and 256 blocks.
+     */
+    free_threshold = BTOBB(need_bytes);
+    free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2));
+    free_threshold = MAX(free_threshold, 256);
+    if (free_blocks < free_threshold) {
+	threshold_block = BLOCK_LSN(tail_lsn, ARCH_NOCONVERT) + free_threshold;
+	threshold_cycle = CYCLE_LSN(tail_lsn, ARCH_NOCONVERT);
+	if (threshold_block >= log->l_logBBsize) {
+	    threshold_block -= log->l_logBBsize;
+	    threshold_cycle += 1;
+	}
+	ASSIGN_ANY_LSN(threshold_lsn, threshold_cycle,
+		       threshold_block, ARCH_NOCONVERT);
+
+	/* Don't pass in an lsn greater than the lsn of the last
+	 * log record known to be on disk.
+	 */
+	if (XFS_LSN_CMP_ARCH(threshold_lsn, log->l_last_sync_lsn, ARCH_NOCONVERT) > 0)
+	    threshold_lsn = log->l_last_sync_lsn;
+    }
+    GRANT_UNLOCK(log, s);
+
+    /*
+     * Get the transaction layer to kick the dirty buffers out to
+     * disk asynchronously. No point in trying to do this if
+     * the filesystem is shutting down.
+     */
+    if (threshold_lsn &&
+	!XLOG_FORCED_SHUTDOWN(log))
+	    xfs_trans_push_ail(mp, threshold_lsn);
+}	/* xlog_grant_push_ail */
+
+
+/*
+ * Flush out the in-core log (iclog) to the on-disk log in a synchronous or
+ * asynchronous fashion.  Previously, we should have moved the current iclog
+ * ptr in the log to point to the next available iclog.	 This allows further
+ * write to continue while this code syncs out an iclog ready to go.
+ * Before an in-core log can be written out, the data section must be scanned
+ * to save away the 1st word of each BBSIZE block into the header.  We replace
+ * it with the current cycle count.  Each BBSIZE block is tagged with the
+ * cycle count because there in an implicit assumption that drives will
+ * guarantee that entire 512 byte blocks get written at once.  In other words,
+ * we can't have part of a 512 byte block written and part not written.	 By
+ * tagging each block, we will know which blocks are valid when recovering
+ * after an unclean shutdown.
+ *
+ * This routine is single threaded on the iclog.  No other thread can be in
+ * this routine with the same iclog.  Changing contents of iclog can there-
+ * fore be done without grabbing the state machine lock.  Updating the global
+ * log will require grabbing the lock though.
+ *
+ * The entire log manager uses a logical block numbering scheme.  Only
+ * log_sync (and then only bwrite()) know about the fact that the log may
+ * not start with block zero on a given device.	 The log block start offset
+ * is added immediately before calling bwrite().
+ */
+
+int
+xlog_sync(xlog_t		*log,
+	  xlog_in_core_t	*iclog,
+	  uint			flags)
+{
+	xfs_caddr_t	dptr;		/* pointer to byte sized element */
+	xfs_buf_t	*bp;
+	int		i;
+	uint		roundup;
+	uint		count;		/* byte count of bwrite */
+	int		split = 0;	/* split write into two regions */
+	int		error;
+
+	XFS_STATS_INC(xfsstats.xs_log_writes);
+	ASSERT(iclog->ic_refcnt == 0);
+
+#ifdef DEBUG
+	if (flags != 0 && (flags & XFS_LOG_SYNC) )
+		xlog_panic("xlog_sync: illegal flag");
+#endif
+
+	/* Round out the log write size */
+	if (iclog->ic_offset & BBMASK) {
+		/* count of 0 is already accounted for up in
+		 * xlog_state_sync_all().  Once in this routine,
+		 * operations on the iclog are single threaded.
+		 *
+		 * Difference between rounded up size and size
+		 */
+		count = iclog->ic_offset & BBMASK;
+		iclog->ic_roundoff += BBSIZE - count;
+	}
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		unsigned sunit = BTOBB(log->l_mp->m_sb.sb_logsunit);
+		if (!sunit)
+			sunit = 1;
+
+		count = BTOBB(log->l_iclog_hsize + iclog->ic_offset);
+		if (count & (sunit - 1)) {
+			roundup = sunit - (count & (sunit - 1));
+		} else {
+			roundup = 0;
+		}
+		iclog->ic_offset += BBTOB(roundup);
+	}
+
+	log->l_roundoff += iclog->ic_roundoff;
+
+	xlog_pack_data(log, iclog);	  /* put cycle number in every block */
+	INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset);	/* real byte length */
+
+	bp	    = iclog->ic_bp;
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1);
+	XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
+	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(iclog->ic_header.h_lsn, ARCH_CONVERT));
+
+	/* Count is already rounded up to a BBSIZE above */
+	count = iclog->ic_offset + iclog->ic_roundoff;
+	ASSERT((count & BBMASK) == 0);
+
+	/* Add for LR header */
+	count += log->l_iclog_hsize;
+	XFS_STATS_ADD(xfsstats.xs_log_blocks, BTOBB(count));
+
+	/* Do we need to split this write into 2 parts? */
+	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
+		split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
+		count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
+		iclog->ic_bwritecnt = 2;	/* split into 2 writes */
+	} else {
+		iclog->ic_bwritecnt = 1;
+	}
+	XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count);
+	XFS_BUF_SET_FSPRIVATE(bp, iclog);	/* save for later */
+	if (flags & XFS_LOG_SYNC){
+		XFS_BUF_BUSY(bp);
+		XFS_BUF_HOLD(bp);
+	} else {
+		XFS_BUF_BUSY(bp);
+		XFS_BUF_ASYNC(bp);
+	}
+	/*
+	 * Do a disk write cache flush for the log block.
+	 * This is a bit of a sledgehammer, it would be better
+	 * to use a tag barrier here that just prevents reordering.
+	 * It may not be needed to flush the first split block in the log wrap
+	 * case, but do it anyways to be safe -AK
+	 */
+	if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
+		XFS_BUF_FLUSH(bp);
+
+	ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
+	ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
+
+	xlog_verify_iclog(log, iclog, count, B_TRUE);
+
+	/* account for log which doesn't start at block #0 */
+	XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+	/*
+	 * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
+	 * is shutting down.
+	 */
+	XFS_BUF_WRITE(bp);
+
+	if ((error = XFS_bwrite(bp))) {
+		xfs_ioerror_alert("xlog_sync", log->l_mp, bp,
+				  XFS_BUF_ADDR(bp));
+		return (error);
+	}
+	if (split) {
+		bp		= iclog->ic_log->l_xbuf;
+		ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) ==
+							(unsigned long)1);
+		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2);
+		XFS_BUF_SET_ADDR(bp, 0);	     /* logical 0 */
+		XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+
+					    (__psint_t)count), split);
+		XFS_BUF_SET_FSPRIVATE(bp, iclog);
+		XFS_BUF_BUSY(bp);
+		XFS_BUF_ASYNC(bp);
+		if (!(log->l_mp->m_flags & XFS_MOUNT_NOLOGFLUSH))
+			XFS_BUF_FLUSH(bp);
+		dptr = XFS_BUF_PTR(bp);
+		/*
+		 * Bump the cycle numbers at the start of each block
+		 * since this part of the buffer is at the start of
+		 * a new cycle.	 Watch out for the header magic number
+		 * case, though.
+		 */
+		for (i=0; i<split; i += BBSIZE) {
+			INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
+			if (INT_GET(*(uint *)dptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
+				INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1);
+			dptr += BBSIZE;
+		}
+
+		ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
+		ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
+
+		/* account for internal log which does't start at block #0 */
+		XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
+		XFS_BUF_WRITE(bp);
+		if ((error = XFS_bwrite(bp))) {
+			xfs_ioerror_alert("xlog_sync (split)", log->l_mp,
+					  bp, XFS_BUF_ADDR(bp));
+			return (error);
+		}
+	}
+	return (0);
+}	/* xlog_sync */
+
+
+/*
+ * Unallocate a log structure
+ */
+void
+xlog_unalloc_log(xlog_t *log)
+{
+	xlog_in_core_t	*iclog, *next_iclog;
+	xlog_ticket_t	*tic, *next_tic;
+	int		i;
+
+
+	iclog = log->l_iclog;
+	for (i=0; i<log->l_iclog_bufs; i++) {
+		sv_destroy(&iclog->ic_forcesema);
+		XFS_freerbuf(iclog->ic_bp);
+#ifdef DEBUG
+		if (iclog->ic_trace != NULL) {
+			ktrace_free(iclog->ic_trace);
+		}
+#endif
+		next_iclog = iclog->ic_next;
+		kmem_free(iclog->hic_data, log->l_iclog_size);
+		kmem_free(iclog, sizeof(xlog_in_core_t));
+		iclog = next_iclog;
+	}
+	freesema(&log->l_flushsema);
+	spinlock_destroy(&log->l_icloglock);
+	spinlock_destroy(&log->l_grant_lock);
+
+	/* XXXsup take a look at this again. */
+	if ((log->l_ticket_cnt != log->l_ticket_tcnt)  &&
+	    !XLOG_FORCED_SHUTDOWN(log)) {
+		xfs_fs_cmn_err(CE_WARN, log->l_mp,
+			"xlog_unalloc_log: (cnt: %d, total: %d)",
+			log->l_ticket_cnt, log->l_ticket_tcnt);
+		/* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
+
+	} else {
+		tic = log->l_unmount_free;
+		while (tic) {
+			next_tic = tic->t_next;
+			kmem_free(tic, NBPP);
+			tic = next_tic;
+		}
+	}
+	XFS_freerbuf(log->l_xbuf);
+#ifdef DEBUG
+	if (log->l_trace != NULL) {
+		ktrace_free(log->l_trace);
+	}
+	if (log->l_grant_trace != NULL) {
+		ktrace_free(log->l_grant_trace);
+	}
+#endif
+	log->l_mp->m_log = NULL;
+	kmem_free(log, sizeof(xlog_t));
+}	/* xlog_unalloc_log */
+
+
+/*
+ * Write some region out to in-core log
+ *
+ * This will be called when writing externally provided regions or when
+ * writing out a commit record for a given transaction.
+ *
+ * General algorithm:
+ *	1. Find total length of this write.  This may include adding to the
+ *		lengths passed in.
+ *	2. Check whether we violate the tickets reservation.
+ *	3. While writing to this iclog
+ *	    A. Reserve as much space in this iclog as can get
+ *	    B. If this is first write, save away start lsn
+ *	    C. While writing this region:
+ *		1. If first write of transaction, write start record
+ *		2. Write log operation header (header per region)
+ *		3. Find out if we can fit entire region into this iclog
+ *		4. Potentially, verify destination bcopy ptr
+ *		5. Bcopy (partial) region
+ *		6. If partial copy, release iclog; otherwise, continue
+ *			copying more regions into current iclog
+ *	4. Mark want sync bit (in simulation mode)
+ *	5. Release iclog for potential flush to on-disk log.
+ *
+ * ERRORS:
+ * 1.	Panic if reservation is overrun.  This should never happen since
+ *	reservation amounts are generated internal to the filesystem.
+ * NOTES:
+ * 1. Tickets are single threaded data structures.
+ * 2. The XLOG_END_TRANS & XLOG_CONTINUE_TRANS flags are passed down to the
+ *	syncing routine.  When a single log_write region needs to span
+ *	multiple in-core logs, the XLOG_CONTINUE_TRANS bit should be set
+ *	on all log operation writes which don't contain the end of the
+ *	region.	 The XLOG_END_TRANS bit is used for the in-core log
+ *	operation which contains the end of the continued log_write region.
+ * 3. When xlog_state_get_iclog_space() grabs the rest of the current iclog,
+ *	we don't really know exactly how much space will be used.  As a result,
+ *	we don't update ic_offset until the end when we know exactly how many
+ *	bytes have been written out.
+ */
+int
+xlog_write(xfs_mount_t *	mp,
+	   xfs_log_iovec_t	reg[],
+	   int			nentries,
+	   xfs_log_ticket_t	tic,
+	   xfs_lsn_t		*start_lsn,
+	   uint			flags)
+{
+    xlog_t	     *log    = mp->m_log;
+    xlog_ticket_t    *ticket = (xlog_ticket_t *)tic;
+    xlog_op_header_t *logop_head;    /* ptr to log operation header */
+    xlog_in_core_t   *iclog;	     /* ptr to current in-core log */
+    __psint_t	     ptr;	     /* copy address into data region */
+    int		     len;	     /* # xlog_write() bytes 2 still copy */
+    int		     index;	     /* region index currently copying */
+    int		     log_offset;     /* offset (from 0) into data region */
+    int		     start_rec_copy; /* # bytes to copy for start record */
+    int		     partial_copy;   /* did we split a region? */
+    int		     partial_copy_len;/* # bytes copied if split region */
+    int		     need_copy;	     /* # bytes need to bcopy this region */
+    int		     copy_len;	     /* # bytes actually bcopy'ing */
+    int		     copy_off;	     /* # bytes from entry start */
+    int		     contwr;	     /* continued write of in-core log? */
+    int		     firstwr = 0;    /* first write of transaction */
+    int		     error;
+
+    partial_copy_len = partial_copy = 0;
+
+    /* Calculate potential maximum space.  Each region gets its own
+     * xlog_op_header_t and may need to be double word aligned.
+     */
+    len = 0;
+    if (ticket->t_flags & XLOG_TIC_INITED)     /* acct for start rec of xact */
+	len += sizeof(xlog_op_header_t);
+
+    for (index = 0; index < nentries; index++) {
+	len += sizeof(xlog_op_header_t);	    /* each region gets >= 1 */
+	len += reg[index].i_len;
+    }
+    contwr = *start_lsn = 0;
+
+    if (ticket->t_curr_res < len) {
+#ifdef DEBUG
+	xlog_panic(
+		"xfs_log_write: reservation ran out. Need to up reservation");
+#else
+	/* Customer configurable panic */
+	xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
+		"xfs_log_write: reservation ran out. Need to up reservation");
+	/* If we did not panic, shutdown the filesystem */
+	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+#endif
+    } else
+	ticket->t_curr_res -= len;
+
+    for (index = 0; index < nentries; ) {
+	if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+					       &contwr, &log_offset)))
+		return (error);
+
+	ASSERT(log_offset <= iclog->ic_size - 1);
+	ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset);
+
+	/* start_lsn is the first lsn written to. That's all we need. */
+	if (! *start_lsn)
+	    *start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
+
+	/* This loop writes out as many regions as can fit in the amount
+	 * of space which was allocated by xlog_state_get_iclog_space().
+	 */
+	while (index < nentries) {
+	    ASSERT(reg[index].i_len % sizeof(__int32_t) == 0);
+	    ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0);
+	    start_rec_copy = 0;
+
+	    /* If first write for transaction, insert start record.
+	     * We can't be trying to commit if we are inited.  We can't
+	     * have any "partial_copy" if we are inited.
+	     */
+	    if (ticket->t_flags & XLOG_TIC_INITED) {
+		logop_head		= (xlog_op_header_t *)ptr;
+		INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
+		logop_head->oh_clientid = ticket->t_clientid;
+		INT_ZERO(logop_head->oh_len, ARCH_CONVERT);
+		logop_head->oh_flags	= XLOG_START_TRANS;
+		INT_ZERO(logop_head->oh_res2, ARCH_CONVERT);
+		ticket->t_flags		&= ~XLOG_TIC_INITED;	/* clear bit */
+		firstwr++;			  /* increment log ops below */
+
+		start_rec_copy = sizeof(xlog_op_header_t);
+		xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy);
+	    }
+
+	    /* Copy log operation header directly into data section */
+	    logop_head			= (xlog_op_header_t *)ptr;
+	    INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid);
+	    logop_head->oh_clientid	= ticket->t_clientid;
+	    INT_ZERO(logop_head->oh_res2, ARCH_CONVERT);
+
+	    /* header copied directly */
+	    xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t));
+
+	    /* are we copying a commit or unmount record? */
+	    logop_head->oh_flags = flags;
+
+	    /*
+	     * We've seen logs corrupted with bad transaction client
+	     * ids.  This makes sure that XFS doesn't generate them on.
+	     * Turn this into an EIO and shut down the filesystem.
+	     */
+	    switch (logop_head->oh_clientid)  {
+	    case XFS_TRANSACTION:
+	    case XFS_VOLUME:
+	    case XFS_LOG:
+		break;
+	    default:
+		xfs_fs_cmn_err(CE_WARN, mp,
+		    "Bad XFS transaction clientid 0x%x in ticket 0x%p",
+		    logop_head->oh_clientid, tic);
+		return XFS_ERROR(EIO);
+	    }
+
+	    /* Partial write last time? => (partial_copy != 0)
+	     * need_copy is the amount we'd like to copy if everything could
+	     * fit in the current bcopy.
+	     */
+	    need_copy = reg[index].i_len - partial_copy_len;
+
+	    copy_off = partial_copy_len;
+	    if (need_copy <= iclog->ic_size - log_offset) { /*complete write */
+		INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy);
+		if (partial_copy)
+		    logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
+		partial_copy_len = partial_copy = 0;
+	    } else {					    /* partial write */
+		copy_len = iclog->ic_size - log_offset;
+		INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len);
+		logop_head->oh_flags |= XLOG_CONTINUE_TRANS;
+		if (partial_copy)
+			logop_head->oh_flags |= XLOG_WAS_CONT_TRANS;
+		partial_copy_len += copy_len;
+		partial_copy++;
+		len += sizeof(xlog_op_header_t); /* from splitting of region */
+		/* account for new log op header */
+		ticket->t_curr_res -= sizeof(xlog_op_header_t);
+	    }
+	    xlog_verify_dest_ptr(log, ptr);
+
+	    /* copy region */
+	    ASSERT(copy_len >= 0);
+	    bcopy(reg[index].i_addr + copy_off, (xfs_caddr_t)ptr, copy_len);
+	    xlog_write_adv_cnt(ptr, len, log_offset, copy_len);
+
+	    /* make copy_len total bytes copied, including headers */
+	    copy_len += start_rec_copy + sizeof(xlog_op_header_t);
+	    xlog_state_finish_copy(log, iclog, firstwr, (contwr? copy_len : 0));
+	    firstwr = 0;
+	    if (partial_copy) {			/* copied partial region */
+		    /* already marked WANT_SYNC by xlog_state_get_iclog_space */
+		    if ((error = xlog_state_release_iclog(log, iclog)))
+			    return (error);
+		    break;			/* don't increment index */
+	    } else {				/* copied entire region */
+		index++;
+		partial_copy_len = partial_copy = 0;
+
+		if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
+		    xlog_state_want_sync(log, iclog);
+		    if ((error = xlog_state_release_iclog(log, iclog)))
+			   return (error);
+		    if (index == nentries)
+			    return 0;		/* we are done */
+		    else
+			    break;
+		}
+	    } /* if (partial_copy) */
+	} /* while (index < nentries) */
+    } /* for (index = 0; index < nentries; ) */
+    ASSERT(len == 0);
+
+    return (xlog_state_release_iclog(log, iclog));
+}	/* xlog_write */
+
+
+/*****************************************************************************
+ *
+ *		State Machine functions
+ *
+ *****************************************************************************
+ */
+
+/* Clean iclogs starting from the head.	 This ordering must be
+ * maintained, so an iclog doesn't become ACTIVE beyond one that
+ * is SYNCING.	This is also required to maintain the notion that we use
+ * a counting semaphore to hold off would be writers to the log when every
+ * iclog is trying to sync to disk.
+ *
+ * State Change: DIRTY -> ACTIVE
+ */
+void
+xlog_state_clean_log(xlog_t *log)
+{
+	xlog_in_core_t	*iclog;
+	int changed = 0;
+
+	iclog = log->l_iclog;
+	do {
+		if (iclog->ic_state == XLOG_STATE_DIRTY) {
+			iclog->ic_state = XLOG_STATE_ACTIVE;
+			iclog->ic_offset       = 0;
+			iclog->ic_callback	= 0;   /* don't need to free */
+			/*
+			 * If the number of ops in this iclog indicate it just
+			 * contains the dummy transaction, we can
+			 * change state into IDLE (the second time around).
+			 * Otherwise we should change the state into NEED a dummy.
+			 * We don't need to cover the dummy.
+			 */
+			if (!changed &&
+			   (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) {
+				changed = 1;
+			} else {	/* we have two dirty iclogs so start over */
+					/* This could also be num of ops indicates
+						this is not the dummy going out. */
+				changed = 2;
+			}
+			INT_ZERO(iclog->ic_header.h_num_logops, ARCH_CONVERT);
+			bzero(iclog->ic_header.h_cycle_data,
+			      sizeof(iclog->ic_header.h_cycle_data));
+			INT_ZERO(iclog->ic_header.h_lsn, ARCH_CONVERT);
+		} else if (iclog->ic_state == XLOG_STATE_ACTIVE)
+			/* do nothing */;
+		else
+			break;	/* stop cleaning */
+		iclog = iclog->ic_next;
+	} while (iclog != log->l_iclog);
+
+	/* log is locked when we are called */
+	/*
+	 * Change state for the dummy log recording.
+	 * We usually go to NEED. But we go to NEED2 if the changed indicates
+	 * we are done writing the dummy record.
+	 * If we are done with the second dummy recored (DONE2), then
+	 * we go to IDLE.
+	 */
+	if (changed) {
+		switch (log->l_covered_state) {
+		case XLOG_STATE_COVER_IDLE:
+		case XLOG_STATE_COVER_NEED:
+		case XLOG_STATE_COVER_NEED2:
+			log->l_covered_state = XLOG_STATE_COVER_NEED;
+			break;
+
+		case XLOG_STATE_COVER_DONE:
+			if (changed == 1)
+				log->l_covered_state = XLOG_STATE_COVER_NEED2;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_NEED;
+			break;
+
+		case XLOG_STATE_COVER_DONE2:
+			if (changed == 1)
+				log->l_covered_state = XLOG_STATE_COVER_IDLE;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_NEED;
+			break;
+
+		default:
+			ASSERT(0);
+		}
+	}
+}	/* xlog_state_clean_log */
+
+STATIC xfs_lsn_t
+xlog_get_lowest_lsn(
+	xlog_t		*log)
+{
+	xlog_in_core_t	*lsn_log;
+	xfs_lsn_t	lowest_lsn, lsn;
+
+	lsn_log = log->l_iclog;
+	lowest_lsn = 0;
+	do {
+	    if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
+		lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT);
+		if ((lsn && !lowest_lsn) ||
+		    (XFS_LSN_CMP_ARCH(lsn, lowest_lsn, ARCH_NOCONVERT) < 0)) {
+			lowest_lsn = lsn;
+		}
+	    }
+	    lsn_log = lsn_log->ic_next;
+	} while (lsn_log != log->l_iclog);
+	return(lowest_lsn);
+}
+
+
+STATIC void
+xlog_state_do_callback(
+	xlog_t		*log,
+	int		aborted,
+	xlog_in_core_t	*ciclog)
+{
+	xlog_in_core_t	   *iclog;
+	xlog_in_core_t	   *first_iclog;	/* used to know when we've
+						 * processed all iclogs once */
+	xfs_log_callback_t *cb, *cb_next;
+	int		   flushcnt = 0;
+	xfs_lsn_t	   lowest_lsn;
+	int		   ioerrors;	/* counter: iclogs with errors */
+	int		   loopdidcallbacks; /* flag: inner loop did callbacks*/
+	int		   funcdidcallbacks; /* flag: function did callbacks */
+	int		   repeats;	/* for issuing console warnings if
+					 * looping too many times */
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+	first_iclog = iclog = log->l_iclog;
+	ioerrors = 0;
+	funcdidcallbacks = 0;
+	repeats = 0;
+
+	do {
+		/*
+		 * Scan all iclogs starting with the one pointed to by the
+		 * log.	 Reset this starting point each time the log is
+		 * unlocked (during callbacks).
+		 *
+		 * Keep looping through iclogs until one full pass is made
+		 * without running any callbacks.
+		 */
+		first_iclog = log->l_iclog;
+		iclog = log->l_iclog;
+		loopdidcallbacks = 0;
+		repeats++;
+
+		do {
+
+			/* skip all iclogs in the ACTIVE & DIRTY states */
+			if (iclog->ic_state &
+			    (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY)) {
+				iclog = iclog->ic_next;
+				continue;
+			}
+
+			/*
+			 * Between marking a filesystem SHUTDOWN and stopping
+			 * the log, we do flush all iclogs to disk (if there
+			 * wasn't a log I/O error). So, we do want things to
+			 * go smoothly in case of just a SHUTDOWN  w/o a
+			 * LOG_IO_ERROR.
+			 */
+			if (!(iclog->ic_state & XLOG_STATE_IOERROR)) {
+				/*
+				 * Can only perform callbacks in order.	 Since
+				 * this iclog is not in the DONE_SYNC/
+				 * DO_CALLBACK state, we skip the rest and
+				 * just try to clean up.  If we set our iclog
+				 * to DO_CALLBACK, we will not process it when
+				 * we retry since a previous iclog is in the
+				 * CALLBACK and the state cannot change since
+				 * we are holding the LOG_LOCK.
+				 */
+				if (!(iclog->ic_state &
+					(XLOG_STATE_DONE_SYNC |
+						 XLOG_STATE_DO_CALLBACK))) {
+					if (ciclog && (ciclog->ic_state ==
+							XLOG_STATE_DONE_SYNC)) {
+						ciclog->ic_state = XLOG_STATE_DO_CALLBACK;
+					}
+					break;
+				}
+				/*
+				 * We now have an iclog that is in either the
+				 * DO_CALLBACK or DONE_SYNC states. The other
+				 * states (WANT_SYNC, SYNCING, or CALLBACK were
+				 * caught by the above if and are going to
+				 * clean (i.e. we aren't doing their callbacks)
+				 * see the above if.
+				 */
+
+				/*
+				 * We will do one more check here to see if we
+				 * have chased our tail around.
+				 */
+
+				lowest_lsn = xlog_get_lowest_lsn(log);
+				if (lowest_lsn && (
+					XFS_LSN_CMP_ARCH(
+						lowest_lsn,
+						INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT),
+						ARCH_NOCONVERT
+					)<0)) {
+					iclog = iclog->ic_next;
+					continue; /* Leave this iclog for
+						   * another thread */
+				}
+
+				iclog->ic_state = XLOG_STATE_CALLBACK;
+
+				LOG_UNLOCK(log, s);
+
+				/* l_last_sync_lsn field protected by
+				 * GRANT_LOCK. Don't worry about iclog's lsn.
+				 * No one else can be here except us.
+				 */
+				s = GRANT_LOCK(log);
+				ASSERT(XFS_LSN_CMP_ARCH(
+						log->l_last_sync_lsn,
+						INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT),
+						ARCH_NOCONVERT
+					)<=0);
+				log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
+				GRANT_UNLOCK(log, s);
+
+				/*
+				 * Keep processing entries in the callback list
+				 * until we come around and it is empty.  We
+				 * need to atomically see that the list is
+				 * empty and change the state to DIRTY so that
+				 * we don't miss any more callbacks being added.
+				 */
+				s = LOG_LOCK(log);
+			} else {
+				ioerrors++;
+			}
+			cb = iclog->ic_callback;
+
+			while (cb != 0) {
+				iclog->ic_callback_tail = &(iclog->ic_callback);
+				iclog->ic_callback = 0;
+				LOG_UNLOCK(log, s);
+
+				/* perform callbacks in the order given */
+				for (; cb != 0; cb = cb_next) {
+					cb_next = cb->cb_next;
+					cb->cb_func(cb->cb_arg, aborted);
+				}
+				s = LOG_LOCK(log);
+				cb = iclog->ic_callback;
+			}
+
+			loopdidcallbacks++;
+			funcdidcallbacks++;
+
+			ASSERT(iclog->ic_callback == 0);
+			if (!(iclog->ic_state & XLOG_STATE_IOERROR))
+				iclog->ic_state = XLOG_STATE_DIRTY;
+
+			/* wake up threads waiting in xfs_log_force() */
+			sv_broadcast(&iclog->ic_forcesema);
+
+			iclog = iclog->ic_next;
+		} while (first_iclog != iclog);
+		if (repeats && (repeats % 10) == 0) {
+			xfs_fs_cmn_err(CE_WARN, log->l_mp,
+				"xlog_state_do_callback: looping %d\n", repeats);
+		}
+	} while (!ioerrors && loopdidcallbacks);
+
+	/*
+	 * make one last gasp attempt to see if iclogs are being left in
+	 * limbo..
+	 */
+#ifdef DEBUG
+	if (funcdidcallbacks) {
+		first_iclog = iclog = log->l_iclog;
+		do {
+			ASSERT(iclog->ic_state != XLOG_STATE_DO_CALLBACK);
+			/*
+			 * Terminate the loop if iclogs are found in states
+			 * which will cause other threads to clean up iclogs.
+			 *
+			 * SYNCING - i/o completion will go through logs
+			 * DONE_SYNC - interrupt thread should be waiting for
+			 *		LOG_LOCK
+			 * IOERROR - give up hope all ye who enter here
+			 */
+			if (iclog->ic_state == XLOG_STATE_SYNCING ||
+			    iclog->ic_state == XLOG_STATE_DONE_SYNC ||
+			    iclog->ic_state == XLOG_STATE_IOERROR )
+				break;
+			iclog = iclog->ic_next;
+		} while (first_iclog != iclog);
+	}
+#endif
+
+	/*
+	 * Transition from DIRTY to ACTIVE if applicable. NOP if
+	 * STATE_IOERROR.
+	 */
+	xlog_state_clean_log(log);
+
+	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) {
+		flushcnt = log->l_flushcnt;
+		log->l_flushcnt = 0;
+	}
+	LOG_UNLOCK(log, s);
+	while (flushcnt--)
+		vsema(&log->l_flushsema);
+}	/* xlog_state_do_callback */
+
+
+/*
+ * Finish transitioning this iclog to the dirty state.
+ *
+ * Make sure that we completely execute this routine only when this is
+ * the last call to the iclog.	There is a good chance that iclog flushes,
+ * when we reach the end of the physical log, get turned into 2 separate
+ * calls to bwrite.  Hence, one iclog flush could generate two calls to this
+ * routine.  By using the reference count bwritecnt, we guarantee that only
+ * the second completion goes through.
+ *
+ * Callbacks could take time, so they are done outside the scope of the
+ * global state machine log lock.  Assume that the calls to cvsema won't
+ * take a long time.  At least we know it won't sleep.
+ */
+void
+xlog_state_done_syncing(
+	xlog_in_core_t	*iclog,
+	int		aborted)
+{
+	xlog_t		   *log = iclog->ic_log;
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+
+	ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
+	       iclog->ic_state == XLOG_STATE_IOERROR);
+	ASSERT(iclog->ic_refcnt == 0);
+	ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
+
+
+	/*
+	 * If we got an error, either on the first buffer, or in the case of
+	 * split log writes, on the second, we mark ALL iclogs STATE_IOERROR,
+	 * and none should ever be attempted to be written to disk
+	 * again.
+	 */
+	if (iclog->ic_state != XLOG_STATE_IOERROR) {
+		if (--iclog->ic_bwritecnt == 1) {
+			LOG_UNLOCK(log, s);
+			return;
+		}
+		iclog->ic_state = XLOG_STATE_DONE_SYNC;
+	}
+
+	/*
+	 * Someone could be sleeping on the next iclog even though it is
+	 * in the ACTIVE state.	 We kick off one thread to force the
+	 * iclog buffer out.
+	 */
+	if (iclog->ic_next->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
+		sv_signal(&iclog->ic_next->ic_forcesema);
+	LOG_UNLOCK(log, s);
+	xlog_state_do_callback(log, aborted, iclog);	/* also cleans log */
+}	/* xlog_state_done_syncing */
+
+
+/*
+ * Update counters atomically now that bcopy is done.
+ */
+/* ARGSUSED */
+static inline void
+xlog_state_finish_copy(xlog_t		*log,
+		       xlog_in_core_t	*iclog,
+		       int		first_write,
+		       int		copy_bytes)
+{
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+
+	if (first_write)
+		INT_MOD(iclog->ic_header.h_num_logops, ARCH_CONVERT, +1);
+	INT_MOD(iclog->ic_header.h_num_logops, ARCH_CONVERT, +1);
+	iclog->ic_offset += copy_bytes;
+
+	LOG_UNLOCK(log, s);
+}	/* xlog_state_finish_copy */
+
+
+
+/*
+ * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
+ * sleep.  The flush semaphore is set to the number of in-core buffers and
+ * decremented around disk syncing.  Therefore, if all buffers are syncing,
+ * this semaphore will cause new writes to sleep until a sync completes.
+ * Otherwise, this code just does p() followed by v().	This approximates
+ * a sleep/wakeup except we can't race.
+ *
+ * The in-core logs are used in a circular fashion. They are not used
+ * out-of-order even when an iclog past the head is free.
+ *
+ * return:
+ *	* log_offset where xlog_write() can start writing into the in-core
+ *		log's data space.
+ *	* in-core log pointer to which xlog_write() should write.
+ *	* boolean indicating this is a continued write to an in-core log.
+ *		If this is the last write, then the in-core log's offset field
+ *		needs to be incremented, depending on the amount of data which
+ *		is copied.
+ */
+int
+xlog_state_get_iclog_space(xlog_t	  *log,
+			   int		  len,
+			   xlog_in_core_t **iclogp,
+			   xlog_ticket_t  *ticket,
+			   int		  *continued_write,
+			   int		  *logoffsetp)
+{
+	SPLDECL(s);
+	int		  log_offset;
+	xlog_rec_header_t *head;
+	xlog_in_core_t	  *iclog;
+	int		  error;
+
+	xlog_state_do_callback(log, 0, NULL);	/* also cleans log */
+
+restart:
+	s = LOG_LOCK(log);
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		LOG_UNLOCK(log, s);
+		return XFS_ERROR(EIO);
+	}
+
+	iclog = log->l_iclog;
+	if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) {
+		log->l_flushcnt++;
+		LOG_UNLOCK(log, s);
+		xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
+		XFS_STATS_INC(xfsstats.xs_log_noiclogs);
+		/* Ensure that log writes happen */
+		psema(&log->l_flushsema, PINOD);
+		goto restart;
+	}
+	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+	head = &iclog->ic_header;
+
+	iclog->ic_refcnt++;			/* prevents sync */
+	log_offset = iclog->ic_offset;
+
+	/* On the 1st write to an iclog, figure out lsn.  This works
+	 * if iclogs marked XLOG_STATE_WANT_SYNC always write out what they are
+	 * committing to.  If the offset is set, that's how many blocks
+	 * must be written.
+	 */
+	if (log_offset == 0) {
+		ticket->t_curr_res -= log->l_iclog_hsize;
+		INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle);
+		ASSIGN_LSN(head->h_lsn, log, ARCH_CONVERT);
+		ASSERT(log->l_curr_block >= 0);
+
+		/* round off error from last write with this iclog */
+		ticket->t_curr_res -= iclog->ic_roundoff;
+		log->l_roundoff -= iclog->ic_roundoff;
+		iclog->ic_roundoff = 0;
+	}
+
+	/* If there is enough room to write everything, then do it.  Otherwise,
+	 * claim the rest of the region and make sure the XLOG_STATE_WANT_SYNC
+	 * bit is on, so this will get flushed out.  Don't update ic_offset
+	 * until you know exactly how many bytes get copied.  Therefore, wait
+	 * until later to update ic_offset.
+	 *
+	 * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
+	 * can fit into remaining data section.
+	 */
+	if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+
+		/* If I'm the only one writing to this iclog, sync it to disk */
+		if (iclog->ic_refcnt == 1) {
+			LOG_UNLOCK(log, s);
+			if ((error = xlog_state_release_iclog(log, iclog)))
+				return (error);
+		} else {
+			iclog->ic_refcnt--;
+			LOG_UNLOCK(log, s);
+		}
+		goto restart;
+	}
+
+	/* Do we have enough room to write the full amount in the remainder
+	 * of this iclog?  Or must we continue a write on the next iclog and
+	 * mark this iclog as completely taken?	 In the case where we switch
+	 * iclogs (to mark it taken), this particular iclog will release/sync
+	 * to disk in xlog_write().
+	 */
+	if (len <= iclog->ic_size - iclog->ic_offset) {
+		*continued_write = 0;
+		iclog->ic_offset += len;
+	} else {
+		*continued_write = 1;
+		xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
+	}
+	*iclogp = iclog;
+
+	ASSERT(iclog->ic_offset <= iclog->ic_size);
+	LOG_UNLOCK(log, s);
+
+	*logoffsetp = log_offset;
+	return 0;
+}	/* xlog_state_get_iclog_space */
+
+/*
+ * Atomically get the log space required for a log ticket.
+ *
+ * Once a ticket gets put onto the reserveq, it will only return after
+ * the needed reservation is satisfied.
+ */
+STATIC int
+xlog_grant_log_space(xlog_t	   *log,
+		     xlog_ticket_t *tic)
+{
+	int		 free_bytes;
+	int		 need_bytes;
+	SPLDECL(s);
+#ifdef DEBUG
+	xfs_lsn_t	 tail_lsn;
+#endif
+
+
+#ifdef DEBUG
+	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+		panic("grant Recovery problem");
+#endif
+
+	/* Is there space or do we need to sleep? */
+	s = GRANT_LOCK(log);
+	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter");
+
+	/* something is already sleeping; insert new transaction at end */
+	if (log->l_reserve_headq) {
+		XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
+		xlog_trace_loggrant(log, tic,
+				    "xlog_grant_log_space: sleep 1");
+		/*
+		 * Gotta check this before going to sleep, while we're
+		 * holding the grant lock.
+		 */
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto error_return;
+
+		XFS_STATS_INC(xfsstats.xs_sleep_logspace);
+		sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+		/*
+		 * If we got an error, and the filesystem is shutting down,
+		 * we'll catch it down below. So just continue...
+		 */
+		xlog_trace_loggrant(log, tic,
+				    "xlog_grant_log_space: wake 1");
+		s = GRANT_LOCK(log);
+	}
+	if (tic->t_flags & XFS_LOG_PERM_RESERV)
+		need_bytes = tic->t_unit_res*tic->t_ocnt;
+	else
+		need_bytes = tic->t_unit_res;
+
+redo:
+	if (XLOG_FORCED_SHUTDOWN(log))
+		goto error_return;
+
+	free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle,
+				     log->l_grant_reserve_bytes);
+	if (free_bytes < need_bytes) {
+		if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+			XLOG_INS_TICKETQ(log->l_reserve_headq, tic);
+		xlog_trace_loggrant(log, tic,
+				    "xlog_grant_log_space: sleep 2");
+		XFS_STATS_INC(xfsstats.xs_sleep_logspace);
+		sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+
+		if (XLOG_FORCED_SHUTDOWN(log)) {
+			s = GRANT_LOCK(log);
+			goto error_return;
+		}
+
+		xlog_trace_loggrant(log, tic,
+				    "xlog_grant_log_space: wake 2");
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		s = GRANT_LOCK(log);
+		goto redo;
+	} else if (tic->t_flags & XLOG_TIC_IN_Q)
+		XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
+
+	/* we've got enough space */
+	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w');
+	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r');
+#ifdef DEBUG
+	tail_lsn = log->l_tail_lsn;
+	/*
+	 * Check to make sure the grant write head didn't just over lap the
+	 * tail.  If the cycles are the same, we can't be overlapping.
+	 * Otherwise, make sure that the cycles differ by exactly one and
+	 * check the byte count.
+	 */
+	if (CYCLE_LSN(tail_lsn, ARCH_NOCONVERT) != log->l_grant_write_cycle) {
+		ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn, ARCH_NOCONVERT));
+		ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn, ARCH_NOCONVERT)));
+	}
+#endif
+	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit");
+	xlog_verify_grant_head(log, 1);
+	GRANT_UNLOCK(log, s);
+	return 0;
+
+ error_return:
+	if (tic->t_flags & XLOG_TIC_IN_Q)
+		XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
+	xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret");
+	/*
+	 * If we are failing, make sure the ticket doesn't have any
+	 * current reservations. We don't want to add this back when
+	 * the ticket/transaction gets cancelled.
+	 */
+	tic->t_curr_res = 0;
+	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+	GRANT_UNLOCK(log, s);
+	return XFS_ERROR(EIO);
+}	/* xlog_grant_log_space */
+
+
+/*
+ * Replenish the byte reservation required by moving the grant write head.
+ *
+ *
+ */
+STATIC int
+xlog_regrant_write_log_space(xlog_t	   *log,
+			     xlog_ticket_t *tic)
+{
+	SPLDECL(s);
+	int		free_bytes, need_bytes;
+	xlog_ticket_t	*ntic;
+#ifdef DEBUG
+	xfs_lsn_t	tail_lsn;
+#endif
+
+	tic->t_curr_res = tic->t_unit_res;
+
+	if (tic->t_cnt > 0)
+		return (0);
+
+#ifdef DEBUG
+	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
+		panic("regrant Recovery problem");
+#endif
+
+	s = GRANT_LOCK(log);
+	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter");
+
+	if (XLOG_FORCED_SHUTDOWN(log))
+		goto error_return;
+
+	/* If there are other waiters on the queue then give them a
+	 * chance at logspace before us. Wake up the first waiters,
+	 * if we do not wake up all the waiters then go to sleep waiting
+	 * for more free space, otherwise try to get some space for
+	 * this transaction.
+	 */
+
+	if ((ntic = log->l_write_headq)) {
+		free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
+					     log->l_grant_write_bytes);
+		do {
+			ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
+
+			if (free_bytes < ntic->t_unit_res)
+				break;
+			free_bytes -= ntic->t_unit_res;
+			sv_signal(&ntic->t_sema);
+			ntic = ntic->t_next;
+		} while (ntic != log->l_write_headq);
+
+		if (ntic != log->l_write_headq) {
+			if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+				XLOG_INS_TICKETQ(log->l_write_headq, tic);
+
+			xlog_trace_loggrant(log, tic,
+				    "xlog_regrant_write_log_space: sleep 1");
+			XFS_STATS_INC(xfsstats.xs_sleep_logspace);
+			sv_wait(&tic->t_sema, PINOD|PLTWAIT,
+				&log->l_grant_lock, s);
+
+			/* If we're shutting down, this tic is already
+			 * off the queue */
+			if (XLOG_FORCED_SHUTDOWN(log)) {
+				s = GRANT_LOCK(log);
+				goto error_return;
+			}
+
+			xlog_trace_loggrant(log, tic,
+				    "xlog_regrant_write_log_space: wake 1");
+			xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
+			s = GRANT_LOCK(log);
+		}
+	}
+
+	need_bytes = tic->t_unit_res;
+
+redo:
+	if (XLOG_FORCED_SHUTDOWN(log))
+		goto error_return;
+
+	free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
+				     log->l_grant_write_bytes);
+	if (free_bytes < need_bytes) {
+		if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
+			XLOG_INS_TICKETQ(log->l_write_headq, tic);
+		XFS_STATS_INC(xfsstats.xs_sleep_logspace);
+		sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+
+		/* If we're shutting down, this tic is already off the queue */
+		if (XLOG_FORCED_SHUTDOWN(log)) {
+			s = GRANT_LOCK(log);
+			goto error_return;
+		}
+
+		xlog_trace_loggrant(log, tic,
+				    "xlog_regrant_write_log_space: wake 2");
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		s = GRANT_LOCK(log);
+		goto redo;
+	} else if (tic->t_flags & XLOG_TIC_IN_Q)
+		XLOG_DEL_TICKETQ(log->l_write_headq, tic);
+
+	XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */
+#ifdef DEBUG
+	tail_lsn = log->l_tail_lsn;
+	if (CYCLE_LSN(tail_lsn, ARCH_NOCONVERT) != log->l_grant_write_cycle) {
+		ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn, ARCH_NOCONVERT));
+		ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn, ARCH_NOCONVERT)));
+	}
+#endif
+
+	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit");
+	xlog_verify_grant_head(log, 1);
+	GRANT_UNLOCK(log, s);
+	return (0);
+
+
+ error_return:
+	if (tic->t_flags & XLOG_TIC_IN_Q)
+		XLOG_DEL_TICKETQ(log->l_reserve_headq, tic);
+	xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret");
+	/*
+	 * If we are failing, make sure the ticket doesn't have any
+	 * current reservations. We don't want to add this back when
+	 * the ticket/transaction gets cancelled.
+	 */
+	tic->t_curr_res = 0;
+	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+	GRANT_UNLOCK(log, s);
+	return XFS_ERROR(EIO);
+}	/* xlog_regrant_write_log_space */
+
+
+/* The first cnt-1 times through here we don't need to
+ * move the grant write head because the permanent
+ * reservation has reserved cnt times the unit amount.
+ * Release part of current permanent unit reservation and
+ * reset current reservation to be one units worth.  Also
+ * move grant reservation head forward.
+ */
+STATIC void
+xlog_regrant_reserve_log_space(xlog_t	     *log,
+			       xlog_ticket_t *ticket)
+{
+	SPLDECL(s);
+
+	xlog_trace_loggrant(log, ticket,
+			    "xlog_regrant_reserve_log_space: enter");
+	if (ticket->t_cnt > 0)
+		ticket->t_cnt--;
+
+	s = GRANT_LOCK(log);
+	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
+	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
+	ticket->t_curr_res = ticket->t_unit_res;
+	xlog_trace_loggrant(log, ticket,
+			    "xlog_regrant_reserve_log_space: sub current res");
+	xlog_verify_grant_head(log, 1);
+
+	/* just return if we still have some of the pre-reserved space */
+	if (ticket->t_cnt > 0) {
+		GRANT_UNLOCK(log, s);
+		return;
+	}
+
+	XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r');
+	xlog_trace_loggrant(log, ticket,
+			    "xlog_regrant_reserve_log_space: exit");
+	xlog_verify_grant_head(log, 0);
+	GRANT_UNLOCK(log, s);
+	ticket->t_curr_res = ticket->t_unit_res;
+}	/* xlog_regrant_reserve_log_space */
+
+
+/*
+ * Give back the space left from a reservation.
+ *
+ * All the information we need to make a correct determination of space left
+ * is present.	For non-permanent reservations, things are quite easy.	The
+ * count should have been decremented to zero.	We only need to deal with the
+ * space remaining in the current reservation part of the ticket.  If the
+ * ticket contains a permanent reservation, there may be left over space which
+ * needs to be released.  A count of N means that N-1 refills of the current
+ * reservation can be done before we need to ask for more space.  The first
+ * one goes to fill up the first current reservation.  Once we run out of
+ * space, the count will stay at zero and the only space remaining will be
+ * in the current reservation field.
+ */
+STATIC void
+xlog_ungrant_log_space(xlog_t	     *log,
+		       xlog_ticket_t *ticket)
+{
+	SPLDECL(s);
+
+	if (ticket->t_cnt > 0)
+		ticket->t_cnt--;
+
+	s = GRANT_LOCK(log);
+	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter");
+
+	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
+	XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
+
+	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current");
+
+	/* If this is a permanent reservation ticket, we may be able to free
+	 * up more space based on the remaining count.
+	 */
+	if (ticket->t_cnt > 0) {
+		ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV);
+		XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w');
+		XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r');
+	}
+
+	xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit");
+	xlog_verify_grant_head(log, 1);
+	GRANT_UNLOCK(log, s);
+	xfs_log_move_tail(log->l_mp, 1);
+}	/* xlog_ungrant_log_space */
+
+
+/*
+ * If the lsn is not found or the iclog with the lsn is in the callback
+ * state, we need to call the function directly.  This is done outside
+ * this function's scope.  Otherwise, we insert the callback at the end
+ * of the iclog's callback list.
+ */
+int
+xlog_state_lsn_is_synced(xlog_t		    *log,
+			 xfs_lsn_t	    lsn,
+			 xfs_log_callback_t *cb,
+			 int		    *abortflg)
+{
+	xlog_in_core_t *iclog;
+	SPLDECL(s);
+	int	      lsn_is_synced = 1;
+
+	*abortflg = 0;
+	s = LOG_LOCK(log);
+
+	iclog = log->l_iclog;
+	do {
+		if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) {
+			iclog = iclog->ic_next;
+			continue;
+		} else {
+			if (iclog->ic_state & XLOG_STATE_DIRTY) /* call it*/
+				break;
+
+			if (iclog->ic_state & XLOG_STATE_IOERROR) {
+				*abortflg = XFS_LI_ABORTED;
+				break;
+			}
+			/* insert callback onto end of list */
+			cb->cb_next = 0;
+			*(iclog->ic_callback_tail) = cb;
+			iclog->ic_callback_tail = &(cb->cb_next);
+			lsn_is_synced = 0;
+			break;
+		}
+	} while (iclog != log->l_iclog);
+
+	LOG_UNLOCK(log, s);
+	return lsn_is_synced;
+}	/* xlog_state_lsn_is_synced */
+
+
+/*
+ * Atomically put back used ticket.
+ */
+void
+xlog_state_put_ticket(xlog_t	    *log,
+		      xlog_ticket_t *tic)
+{
+	unsigned long s;
+
+	s = LOG_LOCK(log);
+	xlog_ticket_put(log, tic);
+	LOG_UNLOCK(log, s);
+}	/* xlog_state_put_ticket */
+
+void xlog_sync_sched(
+	void	*v)
+{
+	xlog_in_core_t	*iclog = (xlog_in_core_t *)v;
+	xlog_t		*log  = iclog->ic_log;
+
+	xlog_sync(log, iclog, 0);
+}
+
+int xlog_mode = 1;
+
+/*
+ * Flush iclog to disk if this is the last reference to the given iclog and
+ * the WANT_SYNC bit is set.
+ *
+ * When this function is entered, the iclog is not necessarily in the
+ * WANT_SYNC state.  It may be sitting around waiting to get filled.
+ *
+ *
+ */
+int
+xlog_state_release_iclog(xlog_t		*log,
+			 xlog_in_core_t *iclog)
+{
+	SPLDECL(s);
+	int		sync = 0;	/* do we sync? */
+
+	xlog_assign_tail_lsn(log->l_mp, 0);
+
+	s = LOG_LOCK(log);
+
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		LOG_UNLOCK(log, s);
+		return XFS_ERROR(EIO);
+	}
+
+	ASSERT(iclog->ic_refcnt > 0);
+	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
+	       iclog->ic_state == XLOG_STATE_WANT_SYNC);
+
+	if (--iclog->ic_refcnt == 0 &&
+	    iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+		sync++;
+		iclog->ic_state = XLOG_STATE_SYNCING;
+		INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn);
+		xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
+		/* cycle incremented when incrementing curr_block */
+	}
+
+	LOG_UNLOCK(log, s);
+
+	/*
+	 * We let the log lock go, so it's possible that we hit a log I/O
+	 * error or someother SHUTDOWN condition that marks the iclog
+	 * as XLOG_STATE_IOERROR before the bwrite. However, we know that
+	 * this iclog has consistent data, so we ignore IOERROR
+	 * flags after this point.
+	 */
+	if (sync) {
+		INIT_TQUEUE(&iclog->ic_write_sched,
+			xlog_sync_sched, (void *) iclog);
+		switch (xlog_mode) {
+		case 0:
+			return xlog_sync(log, iclog, 0);
+		case 1:
+			pagebuf_queue_task(&iclog->ic_write_sched);
+		}
+	}
+	return (0);
+
+}	/* xlog_state_release_iclog */
+
+
+/*
+ * This routine will mark the current iclog in the ring as WANT_SYNC
+ * and move the current iclog pointer to the next iclog in the ring.
+ * When this routine is called from xlog_state_get_iclog_space(), the
+ * exact size of the iclog has not yet been determined.	 All we know is
+ * that every data block.  We have run out of space in this log record.
+ */
+STATIC void
+xlog_state_switch_iclogs(xlog_t		*log,
+			 xlog_in_core_t *iclog,
+			 int		eventual_size)
+{
+	uint roundup;
+
+	ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+	if (!eventual_size)
+		eventual_size = iclog->ic_offset;
+	iclog->ic_state = XLOG_STATE_WANT_SYNC;
+	INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block);
+	log->l_prev_block = log->l_curr_block;
+	log->l_prev_cycle = log->l_curr_cycle;
+
+	/* roll log?: ic_offset changed later */
+	log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize);
+
+	/* Round up to next log-sunit */
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		if (log->l_curr_block & (log->l_mp->m_lstripemask - 1)) {
+			roundup = log->l_mp->m_lstripemask -
+				(log->l_curr_block &
+				 (log->l_mp->m_lstripemask - 1));
+		} else {
+			roundup = 0;
+		}
+		log->l_curr_block += roundup;
+	}
+
+	if (log->l_curr_block >= log->l_logBBsize) {
+		log->l_curr_cycle++;
+		if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
+			log->l_curr_cycle++;
+		log->l_curr_block -= log->l_logBBsize;
+		ASSERT(log->l_curr_block >= 0);
+	}
+	ASSERT(iclog == log->l_iclog);
+	log->l_iclog = iclog->ic_next;
+}	/* xlog_state_switch_iclogs */
+
+
+/*
+ * Write out all data in the in-core log as of this exact moment in time.
+ *
+ * Data may be written to the in-core log during this call.  However,
+ * we don't guarantee this data will be written out.  A change from past
+ * implementation means this routine will *not* write out zero length LRs.
+ *
+ * Basically, we try and perform an intelligent scan of the in-core logs.
+ * If we determine there is no flushable data, we just return.	There is no
+ * flushable data if:
+ *
+ *	1. the current iclog is active and has no data; the previous iclog
+ *		is in the active or dirty state.
+ *	2. the current iclog is drity, and the previous iclog is in the
+ *		active or dirty state.
+ *
+ * We may sleep (call psema) if:
+ *
+ *	1. the current iclog is not in the active nor dirty state.
+ *	2. the current iclog dirty, and the previous iclog is not in the
+ *		active nor dirty state.
+ *	3. the current iclog is active, and there is another thread writing
+ *		to this particular iclog.
+ *	4. a) the current iclog is active and has no other writers
+ *	   b) when we return from flushing out this iclog, it is still
+ *		not in the active nor dirty state.
+ */
+STATIC int
+xlog_state_sync_all(xlog_t *log, uint flags)
+{
+	xlog_in_core_t	*iclog;
+	xfs_lsn_t	lsn;
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+
+	iclog = log->l_iclog;
+	if (iclog->ic_state & XLOG_STATE_IOERROR) {
+		LOG_UNLOCK(log, s);
+		return XFS_ERROR(EIO);
+	}
+
+	/* If the head iclog is not active nor dirty, we just attach
+	 * ourselves to the head and go to sleep.
+	 */
+	if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+	    iclog->ic_state == XLOG_STATE_DIRTY) {
+		/*
+		 * If the head is dirty or (active and empty), then
+		 * we need to look at the previous iclog.  If the previous
+		 * iclog is active or dirty we are done.  There is nothing
+		 * to sync out.	 Otherwise, we attach ourselves to the
+		 * previous iclog and go to sleep.
+		 */
+		if (iclog->ic_state == XLOG_STATE_DIRTY ||
+		    (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) {
+			iclog = iclog->ic_prev;
+			if (iclog->ic_state == XLOG_STATE_ACTIVE ||
+			    iclog->ic_state == XLOG_STATE_DIRTY)
+				goto no_sleep;
+			else
+				goto maybe_sleep;
+		} else {
+			if (iclog->ic_refcnt == 0) {
+				/* We are the only one with access to this
+				 * iclog.  Flush it out now.  There should
+				 * be a roundoff of zero to show that someone
+				 * has already taken care of the roundoff from
+				 * the previous sync.
+				 */
+				ASSERT(iclog->ic_roundoff == 0);
+				iclog->ic_refcnt++;
+				lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT);
+				xlog_state_switch_iclogs(log, iclog, 0);
+				LOG_UNLOCK(log, s);
+
+				if (xlog_state_release_iclog(log, iclog))
+					return XFS_ERROR(EIO);
+				s = LOG_LOCK(log);
+				if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn &&
+				    iclog->ic_state != XLOG_STATE_DIRTY)
+					goto maybe_sleep;
+				else
+					goto no_sleep;
+			} else {
+				/* Someone else is writing to this iclog.
+				 * Use its call to flush out the data.	However,
+				 * the other thread may not force out this LR,
+				 * so we mark it WANT_SYNC.
+				 */
+				xlog_state_switch_iclogs(log, iclog, 0);
+				goto maybe_sleep;
+			}
+		}
+	}
+
+	/* By the time we come around again, the iclog could've been filled
+	 * which would give it another lsn.  If we have a new lsn, just
+	 * return because the relevant data has been flushed.
+	 */
+maybe_sleep:
+	if (flags & XFS_LOG_SYNC) {
+		/*
+		 * We must check if we're shutting down here, before
+		 * we wait, while we're holding the LOG_LOCK.
+		 * Then we check again after waking up, in case our
+		 * sleep was disturbed by a bad news.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR) {
+			LOG_UNLOCK(log, s);
+			return XFS_ERROR(EIO);
+		}
+		XFS_STATS_INC(xfsstats.xs_log_force_sleep);
+		sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s);
+		/*
+		 * No need to grab the log lock here since we're
+		 * only deciding whether or not to return EIO
+		 * and the memory read should be atomic.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR)
+			return XFS_ERROR(EIO);
+
+	} else {
+
+no_sleep:
+		LOG_UNLOCK(log, s);
+	}
+	return 0;
+}	/* xlog_state_sync_all */
+
+
+/*
+ * Used by code which implements synchronous log forces.
+ *
+ * Find in-core log with lsn.
+ *	If it is in the DIRTY state, just return.
+ *	If it is in the ACTIVE state, move the in-core log into the WANT_SYNC
+ *		state and go to sleep or return.
+ *	If it is in any other state, go to sleep or return.
+ *
+ * If filesystem activity goes to zero, the iclog will get flushed only by
+ * bdflush().
+ */
+int
+xlog_state_sync(xlog_t	  *log,
+		xfs_lsn_t lsn,
+		uint	  flags)
+{
+    xlog_in_core_t	*iclog;
+    int			already_slept = 0;
+    SPLDECL(s);
+
+
+try_again:
+    s = LOG_LOCK(log);
+    iclog = log->l_iclog;
+
+    if (iclog->ic_state & XLOG_STATE_IOERROR) {
+	    LOG_UNLOCK(log, s);
+	    return XFS_ERROR(EIO);
+    }
+
+    do {
+	if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) {
+	    iclog = iclog->ic_next;
+	    continue;
+	}
+
+	if (iclog->ic_state == XLOG_STATE_DIRTY) {
+		LOG_UNLOCK(log, s);
+		return 0;
+	}
+
+	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+		/*
+		 * We sleep here if we haven't already slept (e.g.
+		 * this is the first time we've looked at the correct
+		 * iclog buf) and the buffer before us is going to
+		 * be sync'ed.	We have to do that to ensure that the
+		 * log records go out in the proper order.  When it's
+		 * done, someone waiting on this buffer will be woken up
+		 * (maybe us) to flush this buffer out.
+		 *
+		 * Otherwise, we mark the buffer WANT_SYNC, and bump
+		 * up the refcnt so we can release the log (which drops
+		 * the ref count).  The state switch keeps new transaction
+		 * commits from using this buffer.  When the current commits
+		 * finish writing into the buffer, the refcount will drop to
+		 * zero and the buffer will go out then.
+		 */
+		if (!already_slept &&
+		    (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC |
+						 XLOG_STATE_SYNCING))) {
+			ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
+			XFS_STATS_INC(xfsstats.xs_log_force_sleep);
+			sv_wait(&iclog->ic_prev->ic_forcesema, PSWP,
+				&log->l_icloglock, s);
+			already_slept = 1;
+			goto try_again;
+		} else {
+			iclog->ic_refcnt++;
+			xlog_state_switch_iclogs(log, iclog, 0);
+			LOG_UNLOCK(log, s);
+			if (xlog_state_release_iclog(log, iclog))
+				return XFS_ERROR(EIO);
+			s = LOG_LOCK(log);
+		}
+	}
+
+	if ((flags & XFS_LOG_SYNC) && /* sleep */
+	    !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
+
+		/*
+		 * Don't wait on the forcesema if we know that we've
+		 * gotten a log write error.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR) {
+			LOG_UNLOCK(log, s);
+			return XFS_ERROR(EIO);
+		}
+		XFS_STATS_INC(xfsstats.xs_log_force_sleep);
+		sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s);
+		/*
+		 * No need to grab the log lock here since we're
+		 * only deciding whether or not to return EIO
+		 * and the memory read should be atomic.
+		 */
+		if (iclog->ic_state & XLOG_STATE_IOERROR)
+			return XFS_ERROR(EIO);
+	} else {		/* just return */
+		LOG_UNLOCK(log, s);
+	}
+	return 0;
+
+    } while (iclog != log->l_iclog);
+
+    LOG_UNLOCK(log, s);
+    return (0);
+}	/* xlog_state_sync */
+
+
+/*
+ * Called when we want to mark the current iclog as being ready to sync to
+ * disk.
+ */
+void
+xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
+{
+	SPLDECL(s);
+
+	s = LOG_LOCK(log);
+
+	if (iclog->ic_state == XLOG_STATE_ACTIVE) {
+		xlog_state_switch_iclogs(log, iclog, 0);
+	} else {
+		ASSERT(iclog->ic_state &
+			(XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
+	}
+
+	LOG_UNLOCK(log, s);
+}	/* xlog_state_want_sync */
+
+
+
+/*****************************************************************************
+ *
+ *		TICKET functions
+ *
+ *****************************************************************************
+ */
+
+/*
+ *	Algorithm doesn't take into account page size. ;-(
+ */
+STATIC void
+xlog_state_ticket_alloc(xlog_t *log)
+{
+	xlog_ticket_t	*t_list;
+	xlog_ticket_t	*next;
+	xfs_caddr_t	buf;
+	uint		i = (NBPP / sizeof(xlog_ticket_t)) - 2;
+	SPLDECL(s);
+
+	/*
+	 * The kmem_zalloc may sleep, so we shouldn't be holding the
+	 * global lock.	 XXXmiken: may want to use zone allocator.
+	 */
+	buf = (xfs_caddr_t) kmem_zalloc(NBPP, 0);
+
+	s = LOG_LOCK(log);
+
+	/* Attach 1st ticket to Q, so we can keep track of allocated memory */
+	t_list = (xlog_ticket_t *)buf;
+	t_list->t_next = log->l_unmount_free;
+	log->l_unmount_free = t_list++;
+	log->l_ticket_cnt++;
+	log->l_ticket_tcnt++;
+
+	/* Next ticket becomes first ticket attached to ticket free list */
+	if (log->l_freelist != NULL) {
+		ASSERT(log->l_tail != NULL);
+		log->l_tail->t_next = t_list;
+	} else {
+		log->l_freelist = t_list;
+	}
+	log->l_ticket_cnt++;
+	log->l_ticket_tcnt++;
+
+	/* Cycle through rest of alloc'ed memory, building up free Q */
+	for ( ; i > 0; i--) {
+		next = t_list + 1;
+		t_list->t_next = next;
+		t_list = next;
+		log->l_ticket_cnt++;
+		log->l_ticket_tcnt++;
+	}
+	t_list->t_next = 0;
+	log->l_tail = t_list;
+	LOG_UNLOCK(log, s);
+}	/* xlog_state_ticket_alloc */
+
+
+/*
+ * Put ticket into free list
+ *
+ * Assumption: log lock is held around this call.
+ */
+STATIC void
+xlog_ticket_put(xlog_t		*log,
+		xlog_ticket_t	*ticket)
+{
+	sv_destroy(&ticket->t_sema);
+
+	/*
+	 * Don't think caching will make that much difference.	It's
+	 * more important to make debug easier.
+	 */
+#if 0
+	/* real code will want to use LIFO for caching */
+	ticket->t_next = log->l_freelist;
+	log->l_freelist = ticket;
+	/* no need to clear fields */
+#else
+	/* When we debug, it is easier if tickets are cycled */
+	ticket->t_next	   = 0;
+	if (log->l_tail != 0) {
+		log->l_tail->t_next = ticket;
+	} else {
+		ASSERT(log->l_freelist == 0);
+		log->l_freelist = ticket;
+	}
+	log->l_tail	    = ticket;
+#endif /* DEBUG */
+	log->l_ticket_cnt++;
+}	/* xlog_ticket_put */
+
+
+/*
+ * Grab ticket off freelist or allocation some more
+ */
+xlog_ticket_t *
+xlog_ticket_get(xlog_t		*log,
+		int		unit_bytes,
+		int		cnt,
+		char		client,
+		uint		xflags)
+{
+	xlog_ticket_t	*tic;
+	SPLDECL(s);
+
+ alloc:
+	if (log->l_freelist == NULL)
+		xlog_state_ticket_alloc(log);		/* potentially sleep */
+
+	s = LOG_LOCK(log);
+	if (log->l_freelist == NULL) {
+		LOG_UNLOCK(log, s);
+		goto alloc;
+	}
+	tic		= log->l_freelist;
+	log->l_freelist = tic->t_next;
+	if (log->l_freelist == NULL)
+		log->l_tail = NULL;
+	log->l_ticket_cnt--;
+	LOG_UNLOCK(log, s);
+
+	/*
+	 * Permanent reservations have up to 'cnt'-1 active log operations
+	 * in the log.	A unit in this case is the amount of space for one
+	 * of these log operations.  Normal reservations have a cnt of 1
+	 * and their unit amount is the total amount of space required.
+	 * The following line of code adds one log record header length
+	 * for each part of an operation which may fall on a different
+	 * log record.
+	 *
+	 * One more XLOG_HEADER_SIZE is added to account for possible
+	 * round off errors when syncing a LR to disk.	The bytes are
+	 * subtracted if the thread using this ticket is the first writer
+	 * to a new LR.
+	 *
+	 * We add an extra log header for the possibility that the commit
+	 * record is the first data written to a new log record.  In this
+	 * case it is separate from the rest of the transaction data and
+	 * will be charged for the log record header.
+	 */
+	unit_bytes += log->l_iclog_hsize * (XLOG_BTOLRBB(unit_bytes) + 2);
+
+	tic->t_unit_res		= unit_bytes;
+	tic->t_curr_res		= unit_bytes;
+	tic->t_cnt		= cnt;
+	tic->t_ocnt		= cnt;
+	tic->t_tid		= (xlog_tid_t)((__psint_t)tic & 0xffffffff);
+	tic->t_clientid		= client;
+	tic->t_flags		= XLOG_TIC_INITED;
+	if (xflags & XFS_LOG_PERM_RESERV)
+		tic->t_flags |= XLOG_TIC_PERM_RESERV;
+	sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
+
+	return tic;
+}	/* xlog_ticket_get */
+
+
+/******************************************************************************
+ *
+ *		Log debug routines
+ *
+ ******************************************************************************
+ */
+#if defined(DEBUG) && !defined(XLOG_NOLOG)
+/*
+ * Make sure that the destination ptr is within the valid data region of
+ * one of the iclogs.  This uses backup pointers stored in a different
+ * part of the log in case we trash the log structure.
+ */
+void
+xlog_verify_dest_ptr(xlog_t	*log,
+		     __psint_t	ptr)
+{
+	int i;
+	int good_ptr = 0;
+
+	for (i=0; i < log->l_iclog_bufs; i++) {
+		if (ptr >= (__psint_t)log->l_iclog_bak[i] &&
+		    ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size)
+			good_ptr++;
+	}
+	if (! good_ptr)
+		xlog_panic("xlog_verify_dest_ptr: invalid ptr");
+}	/* xlog_verify_dest_ptr */
+
+
+#ifdef XFSDEBUG
+/* check split LR write */
+STATIC void
+xlog_verify_disk_cycle_no(xlog_t	 *log,
+			  xlog_in_core_t *iclog)
+{
+    xfs_buf_t	*bp;
+    uint	cycle_no;
+    xfs_daddr_t i;
+
+    if (BLOCK_LSN(iclog->ic_header.h_lsn, ARCH_CONVERT) < 10) {
+	cycle_no = CYCLE_LSN(iclog->ic_header.h_lsn, ARCH_CONVERT);
+	bp = xlog_get_bp(1, log->l_mp);
+	ASSERT(bp);
+	for (i = 0; i < BLOCK_LSN(iclog->ic_header.h_lsn, ARCH_CONVERT); i++) {
+	    xlog_bread(log, i, 1, bp);
+	    if (GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT) != cycle_no)
+		xlog_warn("XFS: xlog_verify_disk_cycle_no: bad cycle no");
+	}
+	xlog_put_bp(bp);
+    }
+}	/* xlog_verify_disk_cycle_no */
+#endif
+
+STATIC void
+xlog_verify_grant_head(xlog_t *log, int equals)
+{
+    if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) {
+	if (equals)
+	    ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes);
+	else
+	    ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes);
+    } else {
+	ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle);
+	ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes);
+    }
+}	/* xlog_verify_grant_head */
+
+/* check if it will fit */
+STATIC void
+xlog_verify_tail_lsn(xlog_t	    *log,
+		     xlog_in_core_t *iclog,
+		     xfs_lsn_t	    tail_lsn)
+{
+    int blocks;
+
+    if (CYCLE_LSN(tail_lsn, ARCH_NOCONVERT) == log->l_prev_cycle) {
+	blocks =
+	    log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn, ARCH_NOCONVERT));
+	if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
+	    xlog_panic("xlog_verify_tail_lsn: ran out of log space");
+    } else {
+	ASSERT(CYCLE_LSN(tail_lsn, ARCH_NOCONVERT)+1 == log->l_prev_cycle);
+
+	if (BLOCK_LSN(tail_lsn, ARCH_NOCONVERT) == log->l_prev_block)
+	    xlog_panic("xlog_verify_tail_lsn: tail wrapped");
+
+	blocks = BLOCK_LSN(tail_lsn, ARCH_NOCONVERT) - log->l_prev_block;
+	if (blocks < BTOBB(iclog->ic_offset) + 1)
+	    xlog_panic("xlog_verify_tail_lsn: ran out of log space");
+    }
+}	/* xlog_verify_tail_lsn */
+
+/*
+ * Perform a number of checks on the iclog before writing to disk.
+ *
+ * 1. Make sure the iclogs are still circular
+ * 2. Make sure we have a good magic number
+ * 3. Make sure we don't have magic numbers in the data
+ * 4. Check fields of each log operation header for:
+ *	A. Valid client identifier
+ *	B. tid ptr value falls in valid ptr space (user space code)
+ *	C. Length in log record header is correct according to the
+ *		individual operation headers within record.
+ * 5. When a bwrite will occur within 5 blocks of the front of the physical
+ *	log, check the preceding blocks of the physical log to make sure all
+ *	the cycle numbers agree with the current cycle number.
+ */
+STATIC void
+xlog_verify_iclog(xlog_t	 *log,
+		  xlog_in_core_t *iclog,
+		  int		 count,
+		  boolean_t	 syncing)
+{
+	xlog_op_header_t	*ophead;
+	xlog_in_core_t		*icptr;
+	xfs_caddr_t		ptr;
+	xfs_caddr_t		base_ptr;
+	__psint_t		field_offset;
+	__uint8_t		clientid;
+	int			len, i, j, k, op_len;
+	int			idx;
+	SPLDECL(s);
+
+	union ich {
+		xlog_rec_ext_header_t	hic_xheader;
+		char			hic_sector[XLOG_HEADER_SIZE];
+	}*xhdr;
+
+	/* check validity of iclog pointers */
+	s = LOG_LOCK(log);
+	icptr = log->l_iclog;
+	for (i=0; i < log->l_iclog_bufs; i++) {
+		if (icptr == 0)
+			xlog_panic("xlog_verify_iclog: illegal ptr");
+		icptr = icptr->ic_next;
+	}
+	if (icptr != log->l_iclog)
+		xlog_panic("xlog_verify_iclog: corrupt iclog ring");
+	LOG_UNLOCK(log, s);
+
+	/* check log magic numbers */
+	ptr = (xfs_caddr_t) &(iclog->ic_header);
+	if (INT_GET(*(uint *)ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM)
+		xlog_panic("xlog_verify_iclog: illegal magic num");
+
+	for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count;
+	     ptr += BBSIZE) {
+		if (INT_GET(*(uint *)ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
+			xlog_panic("xlog_verify_iclog: unexpected magic num");
+	}
+
+	/* check fields */
+	len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT);
+	ptr = iclog->ic_datap;
+	base_ptr = ptr;
+	ophead = (xlog_op_header_t *)ptr;
+	xhdr = (union ich*)&iclog->ic_header;
+	for (i = 0; i < len; i++) {
+		ophead = (xlog_op_header_t *)ptr;
+
+		/* clientid is only 1 byte */
+		field_offset = (__psint_t)
+			       ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr);
+		if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+			clientid = ophead->oh_clientid;
+		} else {
+			idx = BTOBB((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap);
+			if (idx > (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
+			} else {
+				clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
+			}
+		}
+		if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+			cmn_err(CE_WARN, "xlog_verify_iclog: illegal clientid %d op 0x%p offset 0x%x", clientid, ophead, field_offset);
+
+		/* check length */
+		field_offset = (__psint_t)
+			       ((xfs_caddr_t)&(ophead->oh_len) - base_ptr);
+		if (syncing == B_FALSE || (field_offset & 0x1ff)) {
+			op_len = INT_GET(ophead->oh_len, ARCH_CONVERT);
+		} else {
+			idx = BTOBB((__psint_t)&ophead->oh_len -
+				    (__psint_t)iclog->ic_datap);
+			if (idx > (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
+				j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+				op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT);
+			} else {
+				op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT);
+			}
+		}
+		ptr += sizeof(xlog_op_header_t) + op_len;
+	}
+}	/* xlog_verify_iclog */
+#endif /* DEBUG && !XLOG_NOLOG */
+
+/*
+ * Mark all iclogs IOERROR. LOG_LOCK is held by the caller.
+ */
+STATIC int
+xlog_state_ioerror(
+	xlog_t	*log)
+{
+	xlog_in_core_t	*iclog, *ic;
+
+	iclog = log->l_iclog;
+	if (! (iclog->ic_state & XLOG_STATE_IOERROR)) {
+		/*
+		 * Mark all the incore logs IOERROR.
+		 * From now on, no log flushes will result.
+		 */
+		ic = iclog;
+		do {
+			ic->ic_state = XLOG_STATE_IOERROR;
+			ic = ic->ic_next;
+		} while (ic != iclog);
+		return (0);
+	}
+	/*
+	 * Return non-zero, if state transition has already happened.
+	 */
+	return (1);
+}
+
+/*
+ * This is called from xfs_force_shutdown, when we're forcibly
+ * shutting down the filesystem, typically because of an IO error.
+ * Our main objectives here are to make sure that:
+ *	a. the filesystem gets marked 'SHUTDOWN' for all interested
+ *	   parties to find out, 'atomically'.
+ *	b. those who're sleeping on log reservations, pinned objects and
+ *	    other resources get woken up, and be told the bad news.
+ *	c. nothing new gets queued up after (a) and (b) are done.
+ *	d. if !logerror, flush the iclogs to disk, then seal them off
+ *	   for business.
+ */
+int
+xfs_log_force_umount(
+	struct xfs_mount	*mp,
+	int			logerror)
+{
+	xlog_ticket_t	*tic;
+	xlog_t		*log;
+	int		retval;
+	SPLDECL(s);
+	SPLDECL(s2);
+
+	log = mp->m_log;
+
+	/*
+	 * If this happens during log recovery, don't worry about
+	 * locking; the log isn't open for business yet.
+	 */
+	if (!log ||
+	    log->l_flags & XLOG_ACTIVE_RECOVERY) {
+		mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
+		XFS_BUF_DONE(mp->m_sb_bp);
+		return (0);
+	}
+
+	/*
+	 * Somebody could've already done the hard work for us.
+	 * No need to get locks for this.
+	 */
+	if (logerror && log->l_iclog->ic_state & XLOG_STATE_IOERROR) {
+		ASSERT(XLOG_FORCED_SHUTDOWN(log));
+		return (1);
+	}
+	retval = 0;
+	/*
+	 * We must hold both the GRANT lock and the LOG lock,
+	 * before we mark the filesystem SHUTDOWN and wake
+	 * everybody up to tell the bad news.
+	 */
+	s = GRANT_LOCK(log);
+	s2 = LOG_LOCK(log);
+	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
+	XFS_BUF_DONE(mp->m_sb_bp);
+	/*
+	 * This flag is sort of redundant because of the mount flag, but
+	 * it's good to maintain the separation between the log and the rest
+	 * of XFS.
+	 */
+	log->l_flags |= XLOG_IO_ERROR;
+
+	/*
+	 * If we hit a log error, we want to mark all the iclogs IOERROR
+	 * while we're still holding the loglock.
+	 */
+	if (logerror)
+		retval = xlog_state_ioerror(log);
+	LOG_UNLOCK(log, s2);
+
+	/*
+	 * We don't want anybody waiting for log reservations
+	 * after this. That means we have to wake up everybody
+	 * queued up on reserve_headq as well as write_headq.
+	 * In addition, we make sure in xlog_{re}grant_log_space
+	 * that we don't enqueue anything once the SHUTDOWN flag
+	 * is set, and this action is protected by the GRANTLOCK.
+	 */
+	if ((tic = log->l_reserve_headq)) {
+		do {
+			sv_signal(&tic->t_sema);
+			tic = tic->t_next;
+		} while (tic != log->l_reserve_headq);
+	}
+
+	if ((tic = log->l_write_headq)) {
+		do {
+			sv_signal(&tic->t_sema);
+			tic = tic->t_next;
+		} while (tic != log->l_write_headq);
+	}
+	GRANT_UNLOCK(log, s);
+
+	if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
+		ASSERT(!logerror);
+		/*
+		 * Force the incore logs to disk before shutting the
+		 * log down completely.
+		 */
+		xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC);
+		s2 = LOG_LOCK(log);
+		retval = xlog_state_ioerror(log);
+		LOG_UNLOCK(log, s2);
+	}
+	/*
+	 * Wake up everybody waiting on xfs_log_force.
+	 * Callback all log item committed functions as if the
+	 * log writes were completed.
+	 */
+	xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
+
+#ifdef XFSERRORDEBUG
+	{
+		xlog_in_core_t	*iclog;
+
+		s = LOG_LOCK(log);
+		iclog = log->l_iclog;
+		do {
+			ASSERT(iclog->ic_callback == 0);
+			iclog = iclog->ic_next;
+		} while (iclog != log->l_iclog);
+		LOG_UNLOCK(log, s);
+	}
+#endif
+	/* return non-zero if log IOERROR transition had already happened */
+	return (retval);
+}
+
+int
+xlog_iclogs_empty(xlog_t *log)
+{
+	xlog_in_core_t	*iclog;
+
+	iclog = log->l_iclog;
+	do {
+		if (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT))
+			return(0);
+		iclog = iclog->ic_next;
+	} while (iclog != log->l_iclog);
+	return(1);
+}
+
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
new file mode 100644
index 000000000000..fec59717ab2e
--- /dev/null
+++ b/fs/xfs/xfs_log.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_LOG_H__
+#define __XFS_LOG_H__
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define LSN_FIELD_CYCLE(arch) (((arch)==ARCH_NOCONVERT)?1:0)
+#define LSN_FIELD_BLOCK(arch) (((arch)==ARCH_NOCONVERT)?0:1)
+#else
+#define LSN_FIELD_CYCLE(arch) (0)
+#define LSN_FIELD_BLOCK(arch) (1)
+#endif
+
+/* get lsn fields */
+
+#define CYCLE_LSN(lsn,arch) (INT_GET(((uint *)&(lsn))[LSN_FIELD_CYCLE(arch)], arch))
+#define BLOCK_LSN(lsn,arch) (INT_GET(((uint *)&(lsn))[LSN_FIELD_BLOCK(arch)], arch))
+/* this is used in a spot where we might otherwise double-endian-flip */
+#define CYCLE_LSN_NOCONV(lsn,arch) (((uint *)&(lsn))[LSN_FIELD_CYCLE(arch)])
+
+#ifdef __KERNEL__
+/*
+ * By comparing each compnent, we don't have to worry about extra
+ * endian issues in treating two 32 bit numbers as one 64 bit number
+ */
+static
+#ifdef __GNUC__
+# if !((__GNUC__ == 2) && (__GNUC_MINOR__ == 95))
+__inline__
+#endif
+#endif
+xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2, xfs_arch_t arch)
+{
+	if (CYCLE_LSN(lsn1, arch) != CYCLE_LSN(lsn2, arch))
+		return (CYCLE_LSN(lsn1, arch)<CYCLE_LSN(lsn2, arch))? -999 : 999;
+
+	if (BLOCK_LSN(lsn1, arch) != BLOCK_LSN(lsn2, arch))
+		return (BLOCK_LSN(lsn1, arch)<BLOCK_LSN(lsn2, arch))? -999 : 999;
+
+	return 0;
+}
+
+#define XFS_LSN_CMP_ARCH(x,y,arch)	_lsn_cmp(x, y, arch)
+#define XFS_LSN_CMP(x,y) XFS_LSN_CMP_ARCH(x,y,ARCH_NOCONVERT)
+#define XFS_LSN_DIFF_ARCH(x,y,arch)	_lsn_cmp(x, y, arch)
+#define XFS_LSN_DIFF(x,y) XFS_LSN_DIFF_ARCH(x,y,ARCH_NOCONVERT)
+
+/*
+ * Macros, structures, prototypes for interface to the log manager.
+ */
+
+/*
+ * Flags to xfs_log_mount
+ */
+#define XFS_LOG_RECOVER		0x1
+
+/*
+ * Flags to xfs_log_done()
+ */
+#define XFS_LOG_REL_PERM_RESERV 0x1
+
+
+/*
+ * Flags to xfs_log_reserve()
+ *
+ *	XFS_LOG_SLEEP:	 If space is not available, sleep (default)
+ *	XFS_LOG_NOSLEEP: If space is not available, return error
+ *	XFS_LOG_PERM_RESERV: Permanent reservation.  When writes are
+ *		performed against this type of reservation, the reservation
+ *		is not decreased.  Long running transactions should use this.
+ */
+#define XFS_LOG_SLEEP		0x0
+#define XFS_LOG_NOSLEEP		0x1
+#define XFS_LOG_PERM_RESERV	0x2
+#define XFS_LOG_RESV_ALL	(XFS_LOG_NOSLEEP|XFS_LOG_PERM_RESERV)
+
+
+/*
+ * Flags to xfs_log_force()
+ *
+ *	XFS_LOG_SYNC:	Synchronous force in-core log to disk
+ *	XFS_LOG_FORCE:	Start in-core log write now.
+ *	XFS_LOG_URGE:	Start write within some window of time.
+ *
+ * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set.
+ */
+#define XFS_LOG_SYNC		0x1
+#define XFS_LOG_FORCE		0x2
+#define XFS_LOG_URGE		0x4
+
+#endif	/* __KERNEL__ */
+
+
+/* Log Clients */
+#define XFS_TRANSACTION		0x69
+#define XFS_VOLUME		0x2
+#define XFS_LOG			0xaa
+
+typedef struct xfs_log_iovec {
+	xfs_caddr_t		i_addr;		/* beginning address of region */
+	int		i_len;		/* length in bytes of region */
+} xfs_log_iovec_t;
+
+typedef void* xfs_log_ticket_t;
+
+/*
+ * Structure used to pass callback function and the function's argument
+ * to the log manager.
+ */
+typedef struct xfs_log_callback {
+	struct xfs_log_callback *cb_next;
+	void			(*cb_func)(void *, int);
+	void			*cb_arg;
+} xfs_log_callback_t;
+
+
+#ifdef __KERNEL__
+/* Log manager interfaces */
+struct xfs_mount;
+xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
+		       xfs_log_ticket_t ticket,
+		       uint		flags);
+int	  xfs_log_force(struct xfs_mount *mp,
+			xfs_lsn_t	 lsn,
+			uint		 flags);
+int	  xfs_log_init(void);
+int	  xfs_log_mount(struct xfs_mount *mp,
+			dev_t		 log_dev,
+			xfs_daddr_t		 start_block,
+			int		 num_bblocks);
+int	  xfs_log_mount_finish(struct xfs_mount *mp, int);
+void	  xfs_log_move_tail(struct xfs_mount	*mp,
+			    xfs_lsn_t		tail_lsn);
+void	  xfs_log_notify(struct xfs_mount	*mp,
+			 xfs_lsn_t		lsn,
+			 xfs_log_callback_t	*callback_entry);
+int	  xfs_log_reserve(struct xfs_mount *mp,
+			  int		   length,
+			  int		   count,
+			  xfs_log_ticket_t *ticket,
+			  __uint8_t	   clientid,
+			  uint		   flags);
+int	  xfs_log_write(struct xfs_mount *mp,
+			xfs_log_iovec_t	 region[],
+			int		 nentries,
+			xfs_log_ticket_t ticket,
+			xfs_lsn_t	 *start_lsn);
+int	  xfs_log_unmount(struct xfs_mount *mp);
+int	  xfs_log_unmount_write(struct xfs_mount *mp);
+void	  xfs_log_unmount_dealloc(struct xfs_mount *mp);
+int	  xfs_log_force_umount(struct xfs_mount *mp, int logerror);
+int	  xfs_log_need_covered(struct xfs_mount *mp);
+
+void	  xlog_iodone(struct xfs_buf *);
+
+#endif
+
+
+extern int xlog_debug;		/* set to 1 to enable real log */
+
+
+#endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
new file mode 100644
index 000000000000..9bdce316dd85
--- /dev/null
+++ b/fs/xfs/xfs_log_priv.h
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_LOG_PRIV_H__
+#define __XFS_LOG_PRIV_H__
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_LOG_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_LOG_TRACE
+#endif
+
+struct xfs_buf;
+struct ktrace;
+struct log;
+struct xfs_buf_cancel;
+struct xfs_mount;
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_NUM_ICLOGS		2
+#define XLOG_MAX_ICLOGS		8
+#define XLOG_CALLBACK_SIZE	10
+#define XLOG_HEADER_MAGIC_NUM	0xFEEDbabe	/* Illegal cycle number */
+#define XLOG_VERSION_1		1
+#define XLOG_VERSION_2		2		/* Large IClogs, Log sunit */
+#define XLOG_VERSION_OKBITS	(XLOG_VERSION_1 | XLOG_VERSION_2)
+#define XLOG_RECORD_BSIZE	(16*1024)	/* eventually 32k */
+#define XLOG_BIG_RECORD_BSIZE	(32*1024)	/* 32k buffers */
+#define XLOG_MAX_RECORD_BSIZE	(256*1024)
+#define XLOG_HEADER_CYCLE_SIZE	(32*1024)	/* cycle data in header */
+#define XLOG_RECORD_BSHIFT	14		/* 16384 == 1 << 14 */
+#define XLOG_BIG_RECORD_BSHIFT	15		/* 32k == 1 << 15 */
+#define XLOG_MAX_RECORD_BSHIFT	18		/* 256k == 1 << 18 */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_BTOLRBB)
+int xlog_btolrbb(int b);
+#define XLOG_BTOLRBB(b)		xlog_btolrbb(b)
+#else
+#define XLOG_BTOLRBB(b)		(((b)+XLOG_RECORD_BSIZE-1) >> XLOG_RECORD_BSHIFT)
+#endif
+
+#define XLOG_HEADER_SIZE	512
+
+#define XLOG_TOTAL_REC_SHIFT(log) \
+	BTOBB(XLOG_MAX_ICLOGS << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \
+	 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
+
+/*
+ *  set lsns
+ */
+
+#define ASSIGN_LSN_CYCLE(lsn,cycle,arch) \
+    INT_SET(((uint *)&(lsn))[LSN_FIELD_CYCLE(arch)], arch, (cycle));
+#define ASSIGN_LSN_BLOCK(lsn,block,arch) \
+    INT_SET(((uint *)&(lsn))[LSN_FIELD_BLOCK(arch)], arch, (block));
+#define ASSIGN_ANY_LSN(lsn,cycle,block,arch)  \
+    { \
+	ASSIGN_LSN_CYCLE(lsn,cycle,arch); \
+	ASSIGN_LSN_BLOCK(lsn,block,arch); \
+    }
+#define ASSIGN_LSN(lsn,log,arch) \
+    ASSIGN_ANY_LSN(lsn,(log)->l_curr_cycle,(log)->l_curr_block,arch);
+
+#define XLOG_SET(f,b)		(((f) & (b)) == (b))
+
+#define GET_CYCLE(ptr, arch) \
+    (INT_GET(*(uint *)(ptr), arch) == XLOG_HEADER_MAGIC_NUM ? \
+	 INT_GET(*((uint *)(ptr)+1), arch) : \
+	 INT_GET(*(uint *)(ptr), arch) \
+    )
+
+#define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
+
+
+#ifdef __KERNEL__
+/*
+ * get client id from packed copy.
+ *
+ * this hack is here because the xlog_pack code copies four bytes
+ * of xlog_op_header containing the fields oh_clientid, oh_flags
+ * and oh_res2 into the packed copy.
+ *
+ * later on this four byte chunk is treated as an int and the
+ * client id is pulled out.
+ *
+ * this has endian issues, of course.
+ */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define GET_CLIENT_ID(i,arch) \
+    ((i) & 0xff)
+#else
+#define GET_CLIENT_ID(i,arch) \
+    ((i) >> 24)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_SUB_SPACE)
+void xlog_grant_sub_space(struct log *log, int bytes, int type);
+#define XLOG_GRANT_SUB_SPACE(log,bytes,type)	\
+	xlog_grant_sub_space(log,bytes,type)
+#else
+#define XLOG_GRANT_SUB_SPACE(log,bytes,type)				\
+    {									\
+	if (type == 'w') {						\
+		(log)->l_grant_write_bytes -= (bytes);			\
+		if ((log)->l_grant_write_bytes < 0) {			\
+			(log)->l_grant_write_bytes += (log)->l_logsize; \
+			(log)->l_grant_write_cycle--;			\
+		}							\
+	} else {							\
+		(log)->l_grant_reserve_bytes -= (bytes);		\
+		if ((log)->l_grant_reserve_bytes < 0) {			\
+			(log)->l_grant_reserve_bytes += (log)->l_logsize;\
+			(log)->l_grant_reserve_cycle--;			\
+		}							\
+	 }								\
+    }
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XLOG_GRANT_ADD_SPACE)
+void xlog_grant_add_space(struct log *log, int bytes, int type);
+#define XLOG_GRANT_ADD_SPACE(log,bytes,type)	\
+	xlog_grant_add_space(log,bytes,type)
+#else
+#define XLOG_GRANT_ADD_SPACE(log,bytes,type)				\
+    {									\
+	if (type == 'w') {						\
+		(log)->l_grant_write_bytes += (bytes);			\
+		if ((log)->l_grant_write_bytes > (log)->l_logsize) {	\
+			(log)->l_grant_write_bytes -= (log)->l_logsize; \
+			(log)->l_grant_write_cycle++;			\
+		}							\
+	} else {							\
+		(log)->l_grant_reserve_bytes += (bytes);		\
+		if ((log)->l_grant_reserve_bytes > (log)->l_logsize) {	\
+			(log)->l_grant_reserve_bytes -= (log)->l_logsize;\
+			(log)->l_grant_reserve_cycle++;			\
+		}							\
+	 }								\
+    }
+#endif
+#define XLOG_INS_TICKETQ(q,tic)				\
+    {							\
+	if (q) {					\
+		(tic)->t_next	    = (q);		\
+		(tic)->t_prev	    = (q)->t_prev;	\
+		(q)->t_prev->t_next = (tic);		\
+		(q)->t_prev	    = (tic);		\
+	} else {					\
+		(tic)->t_prev = (tic)->t_next = (tic);	\
+		(q) = (tic);				\
+	}						\
+	(tic)->t_flags |= XLOG_TIC_IN_Q;		\
+    }
+#define XLOG_DEL_TICKETQ(q,tic)				\
+    {							\
+	if ((tic) == (tic)->t_next) {			\
+		(q) = NULL;				\
+	} else {					\
+		(q) = (tic)->t_next;			\
+		(tic)->t_next->t_prev = (tic)->t_prev;	\
+		(tic)->t_prev->t_next = (tic)->t_next;	\
+	}						\
+	(tic)->t_next = (tic)->t_prev = NULL;		\
+	(tic)->t_flags &= ~XLOG_TIC_IN_Q;		\
+    }
+
+
+#define GRANT_LOCK(log)		mutex_spinlock(&(log)->l_grant_lock)
+#define GRANT_UNLOCK(log, s)	mutex_spinunlock(&(log)->l_grant_lock, s)
+#define LOG_LOCK(log)		mutex_spinlock(&(log)->l_icloglock)
+#define LOG_UNLOCK(log, s)	mutex_spinunlock(&(log)->l_icloglock, s)
+
+#define xlog_panic(s)		{cmn_err(CE_PANIC, s); }
+#define xlog_exit(s)		{cmn_err(CE_PANIC, s); }
+#define xlog_warn(s)		{cmn_err(CE_WARN, s); }
+
+/*
+ * In core log state
+ */
+#define XLOG_STATE_ACTIVE    0x0001 /* Current IC log being written to */
+#define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */
+#define XLOG_STATE_SYNCING   0x0004 /* This IC log is syncing */
+#define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */
+#define XLOG_STATE_DO_CALLBACK \
+			     0x0010 /* Process callback functions */
+#define XLOG_STATE_CALLBACK  0x0020 /* Callback functions now */
+#define XLOG_STATE_DIRTY     0x0040 /* Dirty IC log, not ready for ACTIVE status*/
+#define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
+#define XLOG_STATE_ALL	     0x7FFF /* All possible valid flags */
+#define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
+#endif	/* __KERNEL__ */
+
+/*
+ * Flags to log operation header
+ *
+ * The first write of a new transaction will be preceded with a start
+ * record, XLOG_START_TRANS.  Once a transaction is committed, a commit
+ * record is written, XLOG_COMMIT_TRANS.  If a single region can not fit into
+ * the remainder of the current active in-core log, it is split up into
+ * multiple regions.  Each partial region will be marked with a
+ * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
+ *
+ */
+#define XLOG_START_TRANS	0x01	/* Start a new transaction */
+#define XLOG_COMMIT_TRANS	0x02	/* Commit this transaction */
+#define XLOG_CONTINUE_TRANS	0x04	/* Cont this trans into new region */
+#define XLOG_WAS_CONT_TRANS	0x08	/* Cont this trans into new region */
+#define XLOG_END_TRANS		0x10	/* End a continued transaction */
+#define XLOG_UNMOUNT_TRANS	0x20	/* Unmount a filesystem transaction */
+#define XLOG_SKIP_TRANS		(XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \
+				 XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \
+				 XLOG_UNMOUNT_TRANS)
+
+#ifdef __KERNEL__
+/*
+ * Flags to log ticket
+ */
+#define XLOG_TIC_INITED		0x1	/* has been initialized */
+#define XLOG_TIC_PERM_RESERV	0x2	/* permanent reservation */
+#define XLOG_TIC_IN_Q		0x4
+#endif	/* __KERNEL__ */
+
+#define XLOG_UNMOUNT_TYPE	0x556e	/* Un for Unmount */
+
+/*
+ * Flags for log structure
+ */
+#define XLOG_CHKSUM_MISMATCH	0x1	/* used only during recovery */
+#define XLOG_ACTIVE_RECOVERY	0x2	/* in the middle of recovery */
+#define XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
+#define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
+					   shutdown */
+typedef __uint32_t xlog_tid_t;
+
+
+#ifdef __KERNEL__
+/*
+ * Below are states for covering allocation transactions.
+ * By covering, we mean changing the h_tail_lsn in the last on-disk
+ * log write such that no allocation transactions will be re-done during
+ * recovery after a system crash. Recovery starts at the last on-disk
+ * log write.
+ *
+ * These states are used to insert dummy log entries to cover
+ * space allocation transactions which can undo non-transactional changes
+ * after a crash. Writes to a file with space
+ * already allocated do not result in any transactions. Allocations
+ * might include space beyond the EOF. So if we just push the EOF a
+ * little, the last transaction for the file could contain the wrong
+ * size. If there is no file system activity, after an allocation
+ * transaction, and the system crashes, the allocation transaction
+ * will get replayed and the file will be truncated. This could
+ * be hours/days/... after the allocation occurred.
+ *
+ * The fix for this is to do two dummy transactions when the
+ * system is idle. We need two dummy transaction because the h_tail_lsn
+ * in the log record header needs to point beyond the last possible
+ * non-dummy transaction. The first dummy changes the h_tail_lsn to
+ * the first transaction before the dummy. The second dummy causes
+ * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn.
+ *
+ * These dummy transactions get committed when everything
+ * is idle (after there has been some activity).
+ *
+ * There are 5 states used to control this.
+ *
+ *  IDLE -- no logging has been done on the file system or
+ *		we are done covering previous transactions.
+ *  NEED -- logging has occurred and we need a dummy transaction
+ *		when the log becomes idle.
+ *  DONE -- we were in the NEED state and have committed a dummy
+ *		transaction.
+ *  NEED2 -- we detected that a dummy transaction has gone to the
+ *		on disk log with no other transactions.
+ *  DONE2 -- we committed a dummy transaction when in the NEED2 state.
+ *
+ * There are two places where we switch states:
+ *
+ * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2.
+ *	We commit the dummy transaction and switch to DONE or DONE2,
+ *	respectively. In all other states, we don't do anything.
+ *
+ * 2.) When we finish writing the on-disk log (xlog_state_clean_log).
+ *
+ *	No matter what state we are in, if this isn't the dummy
+ *	transaction going out, the next state is NEED.
+ *	So, if we aren't in the DONE or DONE2 states, the next state
+ *	is NEED. We can't be finishing a write of the dummy record
+ *	unless it was committed and the state switched to DONE or DONE2.
+ *
+ *	If we are in the DONE state and this was a write of the
+ *		dummy transaction, we move to NEED2.
+ *
+ *	If we are in the DONE2 state and this was a write of the
+ *		dummy transaction, we move to IDLE.
+ *
+ *
+ * Writing only one dummy transaction can get appended to
+ * one file space allocation. When this happens, the log recovery
+ * code replays the space allocation and a file could be truncated.
+ * This is why we have the NEED2 and DONE2 states before going idle.
+ */
+
+#define XLOG_STATE_COVER_IDLE	0
+#define XLOG_STATE_COVER_NEED	1
+#define XLOG_STATE_COVER_DONE	2
+#define XLOG_STATE_COVER_NEED2	3
+#define XLOG_STATE_COVER_DONE2	4
+
+#define XLOG_COVER_OPS		5
+
+typedef struct xlog_ticket {
+	sv_t		   t_sema;	 /* sleep on this semaphore	 :20 */
+	struct xlog_ticket *t_next;	 /*				 : 4 */
+	struct xlog_ticket *t_prev;	 /*				 : 4 */
+	xlog_tid_t	   t_tid;	 /* transaction identifier	 : 4 */
+	int		   t_curr_res;	 /* current reservation in bytes : 4 */
+	int		   t_unit_res;	 /* unit reservation in bytes	 : 4 */
+	__uint8_t	   t_ocnt;	 /* original count		 : 1 */
+	__uint8_t	   t_cnt;	 /* current count		 : 1 */
+	__uint8_t	   t_clientid;	 /* who does this belong to;	 : 1 */
+	__uint8_t	   t_flags;	 /* properties of reservation	 : 1 */
+} xlog_ticket_t;
+#endif
+
+
+typedef struct xlog_op_header {
+	xlog_tid_t oh_tid;	/* transaction id of operation	:  4 b */
+	int	   oh_len;	/* bytes in data region		:  4 b */
+	__uint8_t  oh_clientid; /* who sent me this		:  1 b */
+	__uint8_t  oh_flags;	/*				:  1 b */
+	ushort	   oh_res2;	/* 32 bit align			:  2 b */
+} xlog_op_header_t;
+
+
+/* valid values for h_fmt */
+#define XLOG_FMT_UNKNOWN  0
+#define XLOG_FMT_LINUX_LE 1
+#define XLOG_FMT_LINUX_BE 2
+#define XLOG_FMT_IRIX_BE  3
+
+/* our fmt */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define XLOG_FMT XLOG_FMT_LINUX_LE
+#else
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define XLOG_FMT XLOG_FMT_LINUX_BE
+#else
+#error unknown byte order
+#endif
+#endif
+
+typedef struct xlog_rec_header {
+	uint	  h_magicno;	/* log record (LR) identifier		:  4 */
+	uint	  h_cycle;	/* write cycle of log			:  4 */
+	int	  h_version;	/* LR version				:  4 */
+	int	  h_len;	/* len in bytes; should be 64-bit aligned: 4 */
+	xfs_lsn_t h_lsn;	/* lsn of this LR			:  8 */
+	xfs_lsn_t h_tail_lsn;	/* lsn of 1st LR w/ buffers not committed: 8 */
+	uint	  h_chksum;	/* may not be used; non-zero if used	:  4 */
+	int	  h_prev_block; /* block number to previous LR		:  4 */
+	int	  h_num_logops; /* number of log operations in this LR	:  4 */
+	uint	  h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
+	/* new fields */
+	int	  h_fmt;	/* format of log record			:  4 */
+	uuid_t	  h_fs_uuid;	/* uuid of FS				: 16 */
+	int	  h_size;	/* iclog size				:  4 */
+} xlog_rec_header_t;
+
+typedef struct xlog_rec_ext_header {
+	uint	  xh_cycle;	/* write cycle of log			: 4 */
+	uint	  xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /*	: 256 */
+} xlog_rec_ext_header_t;
+#ifdef __KERNEL__
+/*
+ * - A log record header is 512 bytes.	There is plenty of room to grow the
+ *	xlog_rec_header_t into the reserved space.
+ * - ic_data follows, so a write to disk can start at the beginning of
+ *	the iclog.
+ * - ic_forcesema is used to implement synchronous forcing of the iclog to disk.
+ * - ic_next is the pointer to the next iclog in the ring.
+ * - ic_bp is a pointer to the buffer used to write this incore log to disk.
+ * - ic_log is a pointer back to the global log structure.
+ * - ic_callback is a linked list of callback function/argument pairs to be
+ *	called after an iclog finishes writing.
+ * - ic_size is the full size of the header plus data.
+ * - ic_offset is the current number of bytes written to in this iclog.
+ * - ic_refcnt is bumped when someone is writing to the log.
+ * - ic_state is the state of the iclog.
+ */
+typedef struct xlog_iclog_fields {
+	sv_t			ic_forcesema;
+	struct xlog_in_core	*ic_next;
+	struct xlog_in_core	*ic_prev;
+	struct xfs_buf		*ic_bp;
+	struct log		*ic_log;
+	xfs_log_callback_t	*ic_callback;
+	xfs_log_callback_t	**ic_callback_tail;
+#ifdef DEBUG
+	struct ktrace		*ic_trace;
+#endif
+	int			ic_size;
+	int			ic_offset;
+	int			ic_refcnt;
+	int			ic_roundoff;
+	int			ic_bwritecnt;
+	ushort_t		ic_state;
+	char			*ic_datap;	/* pointer to iclog data */
+	struct tq_struct	ic_write_sched;
+} xlog_iclog_fields_t;
+
+typedef struct xlog_in_core2 {
+	union {
+		xlog_rec_header_t hic_header;
+		xlog_rec_ext_header_t hic_xheader;
+		char		  hic_sector[XLOG_HEADER_SIZE];
+	} ic_h;
+} xlog_in_core_2_t;
+
+typedef struct xlog_in_core {
+	xlog_iclog_fields_t	hic_fields;
+	xlog_in_core_2_t	*hic_data;
+} xlog_in_core_t;
+
+/*
+ * Defines to save our code from this glop.
+ */
+#define ic_forcesema	hic_fields.ic_forcesema
+#define ic_write_sched	hic_fields.ic_write_sched
+#define ic_next		hic_fields.ic_next
+#define ic_prev		hic_fields.ic_prev
+#define ic_bp		hic_fields.ic_bp
+#define ic_log		hic_fields.ic_log
+#define ic_callback	hic_fields.ic_callback
+#define ic_callback_tail hic_fields.ic_callback_tail
+#define ic_trace	hic_fields.ic_trace
+#define ic_size		hic_fields.ic_size
+#define ic_offset	hic_fields.ic_offset
+#define ic_refcnt	hic_fields.ic_refcnt
+#define ic_roundoff	hic_fields.ic_roundoff
+#define ic_bwritecnt	hic_fields.ic_bwritecnt
+#define ic_state	hic_fields.ic_state
+#define ic_datap	hic_fields.ic_datap
+#define ic_header	hic_data->ic_h.hic_header
+
+/*
+ * The reservation head lsn is not made up of a cycle number and block number.
+ * Instead, it uses a cycle number and byte number.  Logs don't expect to
+ * overflow 31 bits worth of byte offset, so using a byte number will mean
+ * that round off problems won't occur when releasing partial reservations.
+ */
+typedef struct log {
+    /* The following block of fields are changed while holding icloglock */
+    sema_t		l_flushsema;	/* iclog flushing semaphore */
+    int			l_flushcnt;	/* # of procs waiting on this sema */
+    int			l_ticket_cnt;	/* free ticket count */
+    int			l_ticket_tcnt;	/* total ticket count */
+    int			l_covered_state;/* state of "covering disk log entries" */
+    xlog_ticket_t	*l_freelist;	/* free list of tickets */
+    xlog_ticket_t	*l_unmount_free;/* kmem_free these addresses */
+    xlog_ticket_t	*l_tail;	/* free list of tickets */
+    xlog_in_core_t	*l_iclog;	/* head log queue	*/
+    lock_t		l_icloglock;	/* grab to change iclog state */
+    xfs_lsn_t		l_tail_lsn;	/* lsn of 1st LR w/ unflush buffers */
+    xfs_lsn_t		l_last_sync_lsn;/* lsn of last LR on disk */
+    struct xfs_mount	*l_mp;		/* mount point */
+    struct xfs_buf	*l_xbuf;	/* extra buffer for log wrapping */
+    dev_t		l_dev;		/* dev_t of log */
+    xfs_daddr_t		l_logBBstart;	/* start block of log */
+    int			l_logsize;	/* size of log in bytes */
+    int			l_logBBsize;	/* size of log in 512 byte chunks */
+    int			l_roundoff;	/* round off error of all iclogs */
+    int			l_curr_cycle;	/* Cycle number of log writes */
+    int			l_prev_cycle;	/* Cycle # b4 last block increment */
+    int			l_curr_block;	/* current logical block of log */
+    int			l_prev_block;	/* previous logical block of log */
+    int			l_iclog_size;	 /* size of log in bytes */
+    int			l_iclog_size_log;/* log power size of log */
+    int			l_iclog_bufs;	 /* number of iclog buffers */
+
+    /* The following field are used for debugging; need to hold icloglock */
+    char		*l_iclog_bak[XLOG_MAX_ICLOGS];
+
+    /* The following block of fields are changed while holding grant_lock */
+    lock_t		l_grant_lock;		/* protects below fields */
+    xlog_ticket_t	*l_reserve_headq;	/* */
+    xlog_ticket_t	*l_write_headq;		/* */
+    int			l_grant_reserve_cycle;	/* */
+    int			l_grant_reserve_bytes;	/* */
+    int			l_grant_write_cycle;	/* */
+    int			l_grant_write_bytes;	/* */
+
+    /* The following fields don't need locking */
+#ifdef DEBUG
+    struct ktrace	*l_trace;
+    struct ktrace	*l_grant_trace;
+#endif
+    uint		l_flags;
+    uint		l_quotaoffs_flag;/* XFS_DQ_*, if QUOTAOFFs found */
+    struct xfs_buf_cancel **l_buf_cancel_table;
+    int			l_iclog_hsize;	/* size of iclog header */
+    int			l_iclog_heads;	/* number of iclog header sectors */
+} xlog_t;
+
+
+/* common routines */
+extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp,
+				      xlog_in_core_t *iclog);
+extern int	 xlog_find_head(xlog_t *log, xfs_daddr_t *head_blk);
+extern int	 xlog_find_tail(xlog_t	*log,
+				xfs_daddr_t *head_blk,
+				xfs_daddr_t *tail_blk,
+				int readonly);
+extern int	 xlog_print_find_oldest(xlog_t *log, xfs_daddr_t *last_blk);
+extern int	 xlog_recover(xlog_t *log, int readonly);
+extern int	 xlog_recover_finish(xlog_t *log, int mfsi_flags);
+extern void	 xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog);
+extern struct xfs_buf *xlog_get_bp(int,xfs_mount_t *);
+extern void	 xlog_put_bp(struct xfs_buf *);
+extern int	 xlog_bread(xlog_t *, xfs_daddr_t blkno, int bblks, struct xfs_buf *bp);
+extern void	 xlog_recover_process_iunlinks(xlog_t *log);
+
+#define XLOG_TRACE_GRAB_FLUSH  1
+#define XLOG_TRACE_REL_FLUSH   2
+#define XLOG_TRACE_SLEEP_FLUSH 3
+#define XLOG_TRACE_WAKE_FLUSH  4
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
new file mode 100644
index 000000000000..8d1676e1d157
--- /dev/null
+++ b/fs/xfs/xfs_log_recover.c
@@ -0,0 +1,3702 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_log_recover.h>
+
+STATIC int	xlog_find_zeroed(struct log *log, xfs_daddr_t *blk_no);
+
+STATIC int	xlog_clear_stale_blocks(xlog_t	*log, xfs_lsn_t tail_lsn);
+STATIC void	xlog_recover_insert_item_backq(xlog_recover_item_t **q,
+					       xlog_recover_item_t *item);
+
+#if defined(DEBUG)
+STATIC void	xlog_recover_check_summary(xlog_t *log);
+STATIC void	xlog_recover_check_ail(xfs_mount_t *mp, xfs_log_item_t *lip,
+				       int gen);
+#else
+#define xlog_recover_check_summary(log)
+#define xlog_recover_check_ail(mp, lip, gen)
+#endif /* DEBUG */
+
+
+xfs_buf_t *
+xlog_get_bp(int num_bblks,xfs_mount_t *mp)
+{
+	xfs_buf_t   *bp;
+
+	ASSERT(num_bblks > 0);
+
+	bp = XFS_ngetrbuf(BBTOB(num_bblks),mp);
+	return bp;
+}	/* xlog_get_bp */
+
+
+void
+xlog_put_bp(xfs_buf_t *bp)
+{
+	XFS_nfreerbuf(bp);
+}	/* xlog_put_bp */
+
+
+/*
+ * nbblks should be uint, but oh well.	Just want to catch that 32-bit length.
+ */
+int
+xlog_bread(xlog_t	*log,
+	   xfs_daddr_t	blk_no,
+	   int		nbblks,
+	   xfs_buf_t	*bp)
+{
+	int error;
+
+	ASSERT(log);
+	ASSERT(nbblks > 0);
+	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+	ASSERT(bp);
+
+	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
+	XFS_BUF_READ(bp);
+	XFS_BUF_BUSY(bp);
+	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
+
+	xfsbdstrat(log->l_mp, bp);
+	if ((error = xfs_iowait(bp))) {
+		xfs_ioerror_alert("xlog_bread", log->l_mp,
+				  bp, XFS_BUF_ADDR(bp));
+		return (error);
+	}
+	return error;
+}	/* xlog_bread */
+
+
+/*
+ * Write out the buffer at the given block for the given number of blocks.
+ * The buffer is kept locked across the write and is returned locked.
+ * This can only be used for synchronous log writes.
+ */
+int
+xlog_bwrite(
+	xlog_t	*log,
+	int	blk_no,
+	int	nbblks,
+	xfs_buf_t	*bp)
+{
+	int	error;
+
+	ASSERT(nbblks > 0);
+	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+
+	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
+	XFS_BUF_ZEROFLAGS(bp);
+	XFS_BUF_BUSY(bp);
+	XFS_BUF_HOLD(bp);
+	XFS_BUF_PSEMA(bp, PRIBIO);
+	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
+	XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
+
+	if ((error = xfs_bwrite(log->l_mp, bp)))
+		xfs_ioerror_alert("xlog_bwrite", log->l_mp,
+				  bp, XFS_BUF_ADDR(bp));
+
+	return (error);
+}	/* xlog_bwrite */
+
+#ifdef DEBUG
+/*
+ * check log record header for recovery
+ */
+
+static void
+xlog_header_check_dump(xfs_mount_t *mp, xlog_rec_header_t *head)
+{
+    int b;
+
+    printk("xlog_header_check_dump:\n	 SB : uuid = ");
+    for (b=0;b<16;b++) printk("%02x",((unsigned char *)&mp->m_sb.sb_uuid)[b]);
+    printk(", fmt = %d\n",XLOG_FMT);
+    printk("	log: uuid = ");
+    for (b=0;b<16;b++) printk("%02x",((unsigned char *)&head->h_fs_uuid)[b]);
+    printk(", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT));
+}
+#endif
+
+/*
+ * check log record header for recovery
+ */
+
+STATIC int
+xlog_header_check_recover(xfs_mount_t *mp, xlog_rec_header_t *head)
+{
+    ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+
+    /*
+     * IRIX doesn't write the h_fmt field and leaves it zeroed
+     * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
+     * a dirty log created in IRIX.
+     */
+
+    if (INT_GET(head->h_fmt, ARCH_CONVERT) != XLOG_FMT) {
+	xlog_warn("XFS: dirty log written in incompatible format - can't recover");
+#ifdef DEBUG
+	xlog_header_check_dump(mp, head);
+#endif
+	return XFS_ERROR(EFSCORRUPTED);
+    } else if (!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid)) {
+	xlog_warn("XFS: dirty log entry has mismatched uuid - can't recover");
+#ifdef DEBUG
+	xlog_header_check_dump(mp, head);
+#endif
+	return XFS_ERROR(EFSCORRUPTED);
+    }
+
+    return 0;
+}
+
+/*
+ * read the head block of the log and check the header
+ */
+
+STATIC int
+xlog_header_check_mount(xfs_mount_t *mp, xlog_rec_header_t *head)
+{
+    ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+
+    if (uuid_is_nil(&head->h_fs_uuid)) {
+
+	/*
+	 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
+	 * h_fs_uuid is nil, we assume this log was last mounted
+	 * by IRIX and continue.
+	 */
+
+	xlog_warn("XFS: nil uuid in log - IRIX style log");
+
+    } else if (!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid)) {
+	xlog_warn("XFS: log has mismatched uuid - can't recover");
+#ifdef DEBUG
+	xlog_header_check_dump(mp, head);
+#endif
+	return XFS_ERROR(EFSCORRUPTED);
+    }
+
+    return 0;
+}
+
+STATIC void
+xlog_recover_iodone(
+	struct xfs_buf	*bp)
+{
+	xfs_mount_t	*mp;
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
+
+	if (XFS_BUF_GETERROR(bp)) {
+		/*
+		 * We're not going to bother about retrying
+		 * this during recovery. One strike!
+		 */
+		mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
+		xfs_ioerror_alert("xlog_recover_iodone",
+				  mp, bp, XFS_BUF_ADDR(bp));
+		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+	}
+	XFS_BUF_SET_FSPRIVATE(bp, NULL);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	xfs_biodone(bp);
+}
+
+/*
+ * This routine finds (to an approximation) the first block in the physical
+ * log which contains the given cycle.	It uses a binary search algorithm.
+ * Note that the algorithm can not be perfect because the disk will not
+ * necessarily be perfect.
+ */
+int
+xlog_find_cycle_start(xlog_t	*log,
+		      xfs_buf_t *bp,
+		      xfs_daddr_t	first_blk,
+		      xfs_daddr_t	*last_blk,
+		      uint	cycle)
+{
+	xfs_daddr_t mid_blk;
+	uint	mid_cycle;
+	int	error;
+
+	mid_blk = BLK_AVG(first_blk, *last_blk);
+	while (mid_blk != first_blk && mid_blk != *last_blk) {
+		if ((error = xlog_bread(log, mid_blk, 1, bp)))
+			return error;
+		mid_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+		if (mid_cycle == cycle) {
+			*last_blk = mid_blk;
+			/* last_half_cycle == mid_cycle */
+		} else {
+			first_blk = mid_blk;
+			/* first_half_cycle == mid_cycle */
+		}
+		mid_blk = BLK_AVG(first_blk, *last_blk);
+	}
+	ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
+	       (mid_blk == *last_blk && mid_blk-1 == first_blk));
+
+	return 0;
+}	/* xlog_find_cycle_start */
+
+
+/*
+ * Check that the range of blocks does not contain the cycle number
+ * given.  The scan needs to occur from front to back and the ptr into the
+ * region must be updated since a later routine will need to perform another
+ * test.  If the region is completely good, we end up returning the same
+ * last block number.
+ *
+ * Set blkno to -1 if we encounter no errors.  This is an invalid block number
+ * since we don't ever expect logs to get this large.
+ */
+
+STATIC int
+xlog_find_verify_cycle( xlog_t		*log,
+			xfs_daddr_t	start_blk,
+			int		nbblks,
+			uint		stop_on_cycle_no,
+			xfs_daddr_t	*new_blk)
+{
+	xfs_daddr_t		i, j;
+	uint			cycle;
+	xfs_buf_t		*bp;
+	char			*buf	    = NULL;
+	int			error	    = 0;
+	xfs_daddr_t		bufblks;
+
+	bufblks = 1 << ffs(nbblks);
+
+	while (!(bp = xlog_get_bp(bufblks, log->l_mp))) {
+		/* can't get enough memory to do everything in one big buffer */
+		bufblks >>= 1;
+		if (!bufblks)
+			return ENOMEM;
+	}
+
+
+	for (i = start_blk; i < start_blk + nbblks; i += bufblks)  {
+		int bcount = min(bufblks, (start_blk + nbblks - i));
+
+		if ((error = xlog_bread(log, i, bcount, bp)))
+			goto out;
+
+		buf = XFS_BUF_PTR(bp);
+		for (j = 0; j < bcount; j++) {
+			cycle = GET_CYCLE(buf, ARCH_CONVERT);
+			if (cycle == stop_on_cycle_no) {
+				*new_blk = i+j;
+				goto out;
+			}
+
+			buf += BBSIZE;
+		}
+	}
+
+	*new_blk = -1;
+
+out:
+	xlog_put_bp(bp);
+
+	return error;
+}	/* xlog_find_verify_cycle */
+
+
+/*
+ * Potentially backup over partial log record write.
+ *
+ * In the typical case, last_blk is the number of the block directly after
+ * a good log record.  Therefore, we subtract one to get the block number
+ * of the last block in the given buffer.  extra_bblks contains the number
+ * of blocks we would have read on a previous read.  This happens when the
+ * last log record is split over the end of the physical log.
+ *
+ * extra_bblks is the number of blocks potentially verified on a previous
+ * call to this routine.
+ */
+
+STATIC int
+xlog_find_verify_log_record(xlog_t	*log,
+			    xfs_daddr_t start_blk,
+			    xfs_daddr_t *last_blk,
+			    int		extra_bblks)
+{
+    xfs_daddr_t		i;
+    xfs_buf_t		*bp;
+    char		*buf	    = NULL;
+    xlog_rec_header_t	*head	    = NULL;
+    int			error	    = 0;
+    int			smallmem    = 0;
+    int			num_blks    = *last_blk - start_blk;
+    int			xhdrs;
+
+    ASSERT(start_blk != 0 || *last_blk != start_blk);
+
+    if (!(bp = xlog_get_bp(num_blks, log->l_mp))) {
+	if (!(bp = xlog_get_bp(1, log->l_mp)))
+	    return ENOMEM;
+	smallmem = 1;
+	buf = XFS_BUF_PTR(bp);
+    } else {
+	if ((error = xlog_bread(log, start_blk, num_blks, bp)))
+	    goto out;
+	buf = XFS_BUF_PTR(bp) + (num_blks - 1) * BBSIZE;
+    }
+
+
+    for (i=(*last_blk)-1; i>=0; i--) {
+	if (i < start_blk) {
+	    /* legal log record not found */
+	    xlog_warn("XFS: Log inconsistent (didn't find previous header)");
+	    ASSERT(0);
+	    error = XFS_ERROR(EIO);
+	    goto out;
+	}
+
+	if (smallmem && (error = xlog_bread(log, i, 1, bp)))
+	    goto out;
+	head = (xlog_rec_header_t*)buf;
+
+	if (INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
+	    break;
+
+	if (!smallmem)
+	    buf -= BBSIZE;
+    }
+
+    /*
+     * We hit the beginning of the physical log & still no header.  Return
+     * to caller.  If caller can handle a return of -1, then this routine
+     * will be called again for the end of the physical log.
+     */
+    if (i == -1) {
+	error = -1;
+	goto out;
+    }
+
+    /* we have the final block of the good log (the first block
+     * of the log record _before_ the head. So we check the uuid.
+     */
+
+    if ((error = xlog_header_check_mount(log->l_mp, head)))
+	goto out;
+
+    /*
+     * We may have found a log record header before we expected one.
+     * last_blk will be the 1st block # with a given cycle #.  We may end
+     * up reading an entire log record.	 In this case, we don't want to
+     * reset last_blk.	Only when last_blk points in the middle of a log
+     * record do we update last_blk.
+     */
+    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+	int h_size = INT_GET(head->h_size, ARCH_CONVERT);
+	xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
+	if (h_size % XLOG_HEADER_CYCLE_SIZE)
+		xhdrs++;
+    } else {
+	xhdrs = 1;
+    }
+
+    if (*last_blk - i + extra_bblks
+		!= BTOBB(INT_GET(head->h_len, ARCH_CONVERT))+xhdrs)
+	    *last_blk = i;
+
+out:
+    xlog_put_bp(bp);
+
+    return error;
+}	/* xlog_find_verify_log_record */
+
+/*
+ * Head is defined to be the point of the log where the next log write
+ * write could go.  This means that incomplete LR writes at the end are
+ * eliminated when calculating the head.  We aren't guaranteed that previous
+ * LR have complete transactions.  We only know that a cycle number of
+ * current cycle number -1 won't be present in the log if we start writing
+ * from our current block number.
+ *
+ * last_blk contains the block number of the first block with a given
+ * cycle number.
+ *
+ * Also called from xfs_log_print.c
+ *
+ * Return: zero if normal, non-zero if error.
+ */
+int
+xlog_find_head(xlog_t  *log,
+	       xfs_daddr_t *return_head_blk)
+{
+    xfs_buf_t	*bp;
+    xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
+    int	    num_scan_bblks;
+    uint    first_half_cycle, last_half_cycle;
+    uint    stop_on_cycle;
+    int	    error, log_bbnum = log->l_logBBsize;
+
+    /* Is the end of the log device zeroed? */
+    if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
+	*return_head_blk = first_blk;
+
+	/* is the whole lot zeroed? */
+	if (!first_blk) {
+	    /* Linux XFS shouldn't generate totally zeroed logs -
+	     * mkfs etc write a dummy unmount record to a fresh
+	     * log so we can store the uuid in there
+	     */
+	    xlog_warn("XFS: totally zeroed log\n");
+	}
+
+	return 0;
+    } else if (error) {
+	xlog_warn("XFS: empty log check failed");
+	return error;
+    }
+
+    first_blk = 0;				/* get cycle # of 1st block */
+    bp = xlog_get_bp(1,log->l_mp);
+    if (!bp)
+	return ENOMEM;
+    if ((error = xlog_bread(log, 0, 1, bp)))
+	goto bp_err;
+    first_half_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+
+    last_blk = head_blk = log_bbnum-1;		/* get cycle # of last block */
+    if ((error = xlog_bread(log, last_blk, 1, bp)))
+	goto bp_err;
+    last_half_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+    ASSERT(last_half_cycle != 0);
+
+    /*
+     * If the 1st half cycle number is equal to the last half cycle number,
+     * then the entire log is stamped with the same cycle number.  In this
+     * case, head_blk can't be set to zero (which makes sense).	 The below
+     * math doesn't work out properly with head_blk equal to zero.  Instead,
+     * we set it to log_bbnum which is an illegal block number, but this
+     * value makes the math correct.  If head_blk doesn't changed through
+     * all the tests below, *head_blk is set to zero at the very end rather
+     * than log_bbnum.	In a sense, log_bbnum and zero are the same block
+     * in a circular file.
+     */
+    if (first_half_cycle == last_half_cycle) {
+	/*
+	 * In this case we believe that the entire log should have cycle
+	 * number last_half_cycle.  We need to scan backwards from the
+	 * end verifying that there are no holes still containing
+	 * last_half_cycle - 1.	 If we find such a hole, then the start
+	 * of that hole will be the new head.  The simple case looks like
+	 *	  x | x ... | x - 1 | x
+	 * Another case that fits this picture would be
+	 *	  x | x + 1 | x ... | x
+	 * In this case the head really is somwhere at the end of the
+	 * log, as one of the latest writes at the beginning was incomplete.
+	 * One more case is
+	 *	  x | x + 1 | x ... | x - 1 | x
+	 * This is really the combination of the above two cases, and the
+	 * head has to end up at the start of the x-1 hole at the end of
+	 * the log.
+	 *
+	 * In the 256k log case, we will read from the beginning to the
+	 * end of the log and search for cycle numbers equal to x-1.  We
+	 * don't worry about the x+1 blocks that we encounter, because
+	 * we know that they cannot be the head since the log started with
+	 * x.
+	 */
+	head_blk = log_bbnum;
+	stop_on_cycle = last_half_cycle - 1;
+    } else {
+	/*
+	 * In this case we want to find the first block with cycle number
+	 * matching last_half_cycle.  We expect the log to be some
+	 * variation on
+	 *	  x + 1 ... | x ...
+	 * The first block with cycle number x (last_half_cycle) will be
+	 * where the new head belongs.	First we do a binary search for
+	 * the first occurrence of last_half_cycle.  The binary search
+	 * may not be totally accurate, so then we scan back from there
+	 * looking for occurrences of last_half_cycle before us.  If
+	 * that backwards scan wraps around the beginning of the log,
+	 * then we look for occurrences of last_half_cycle - 1 at the
+	 * end of the log.  The cases we're looking for look like
+	 *	  x + 1 ... | x | x + 1 | x ...
+	 *				 ^ binary search stopped here
+	 * or
+	 *	  x + 1 ... | x ... | x - 1 | x
+	 *	  <---------> less than scan distance
+	 */
+	stop_on_cycle = last_half_cycle;
+	if ((error = xlog_find_cycle_start(log, bp, first_blk,
+					  &head_blk, last_half_cycle)))
+	    goto bp_err;
+    }
+
+    /*
+     * Now validate the answer.	 Scan back some number of maximum possible
+     * blocks and make sure each one has the expected cycle number.  The
+     * maximum is determined by the total possible amount of buffering
+     * in the in-core log.  The following number can be made tighter if
+     * we actually look at the block size of the filesystem.
+     */
+    num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+    if (head_blk >= num_scan_bblks) {
+	/*
+	 * We are guaranteed that the entire check can be performed
+	 * in one buffer.
+	 */
+	start_blk = head_blk - num_scan_bblks;
+	if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks,
+					 stop_on_cycle, &new_blk)))
+	    goto bp_err;
+	if (new_blk != -1)
+	    head_blk = new_blk;
+    } else {			/* need to read 2 parts of log */
+	/*
+	 * We are going to scan backwards in the log in two parts.  First
+	 * we scan the physical end of the log.	 In this part of the log,
+	 * we are looking for blocks with cycle number last_half_cycle - 1.
+	 * If we find one, then we know that the log starts there, as we've
+	 * found a hole that didn't get written in going around the end
+	 * of the physical log.	 The simple case for this is
+	 *	  x + 1 ... | x ... | x - 1 | x
+	 *	  <---------> less than scan distance
+	 * If all of the blocks at the end of the log have cycle number
+	 * last_half_cycle, then we check the blocks at the start of the
+	 * log looking for occurrences of last_half_cycle.  If we find one,
+	 * then our current estimate for the location of the first
+	 * occurrence of last_half_cycle is wrong and we move back to the
+	 * hole we've found.  This case looks like
+	 *	  x + 1 ... | x | x + 1 | x ...
+	 *				 ^ binary search stopped here
+	 * Another case we need to handle that only occurs in 256k logs is
+	 *	  x + 1 ... | x ... | x+1 | x ...
+	 *		     ^ binary search stops here
+	 * In a 256k log, the scan at the end of the log will see the x+1
+	 * blocks.  We need to skip past those since that is certainly not
+	 * the head of the log.	 By searching for last_half_cycle-1 we
+	 * accomplish that.
+	 */
+	start_blk = log_bbnum - num_scan_bblks + head_blk;
+	ASSERT(head_blk <= INT_MAX && (xfs_daddr_t) num_scan_bblks-head_blk >= 0);
+	if ((error = xlog_find_verify_cycle(log, start_blk,
+			num_scan_bblks-(int)head_blk, (stop_on_cycle - 1),
+			&new_blk)))
+		goto bp_err;
+	if (new_blk != -1) {
+	    head_blk = new_blk;
+	    goto bad_blk;
+	}
+
+	/*
+	 * Scan beginning of log now.  The last part of the physical log
+	 * is good.  This scan needs to verify that it doesn't find the
+	 * last_half_cycle.
+	 */
+	start_blk = 0;
+	ASSERT(head_blk <= INT_MAX);
+	if ((error = xlog_find_verify_cycle(log, start_blk, (int) head_blk,
+					 stop_on_cycle, &new_blk)))
+	    goto bp_err;
+	if (new_blk != -1)
+	    head_blk = new_blk;
+    }
+
+bad_blk:
+    /*
+     * Now we need to make sure head_blk is not pointing to a block in
+     * the middle of a log record.
+     */
+    num_scan_bblks = BTOBB(XLOG_MAX_RECORD_BSIZE);
+    if (head_blk >= num_scan_bblks) {
+	start_blk = head_blk - num_scan_bblks;	/* don't read head_blk */
+
+	/* start ptr at last block ptr before head_blk */
+	if ((error = xlog_find_verify_log_record(log,
+						 start_blk,
+						 &head_blk,
+						 0)) == -1) {
+	    error = XFS_ERROR(EIO);
+	    goto bp_err;
+	} else if (error)
+	    goto bp_err;
+    } else {
+	start_blk = 0;
+	ASSERT(head_blk <= INT_MAX);
+	if ((error = xlog_find_verify_log_record(log,
+						 start_blk,
+						 &head_blk,
+						 0)) == -1) {
+	    /* We hit the beginning of the log during our search */
+	    start_blk = log_bbnum - num_scan_bblks + head_blk;
+	    new_blk = log_bbnum;
+	    ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0);
+	    ASSERT(head_blk <= INT_MAX);
+	    if ((error = xlog_find_verify_log_record(log,
+						     start_blk,
+						     &new_blk,
+						     (int)head_blk)) == -1) {
+		error = XFS_ERROR(EIO);
+		goto bp_err;
+	    } else if (error)
+		goto bp_err;
+	    if (new_blk != log_bbnum)
+		head_blk = new_blk;
+	} else if (error)
+	    goto bp_err;
+    }
+
+    xlog_put_bp(bp);
+    if (head_blk == log_bbnum)
+	    *return_head_blk = 0;
+    else
+	    *return_head_blk = head_blk;
+    /*
+     * When returning here, we have a good block number.  Bad block
+     * means that during a previous crash, we didn't have a clean break
+     * from cycle number N to cycle number N-1.	 In this case, we need
+     * to find the first block with cycle number N-1.
+     */
+    return 0;
+
+bp_err:
+	xlog_put_bp(bp);
+
+	if (error)
+	    xlog_warn("XFS: failed to find log head");
+
+	return error;
+}	/* xlog_find_head */
+
+/*
+ * Find the sync block number or the tail of the log.
+ *
+ * This will be the block number of the last record to have its
+ * associated buffers synced to disk.  Every log record header has
+ * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
+ * to get a sync block number.	The only concern is to figure out which
+ * log record header to believe.
+ *
+ * The following algorithm uses the log record header with the largest
+ * lsn.	 The entire log record does not need to be valid.  We only care
+ * that the header is valid.
+ *
+ * We could speed up search by using current head_blk buffer, but it is not
+ * available.
+ */
+int
+xlog_find_tail(xlog_t  *log,
+	       xfs_daddr_t *head_blk,
+	       xfs_daddr_t *tail_blk,
+	       int readonly)
+{
+	xlog_rec_header_t	*rhead;
+	xlog_op_header_t	*op_head;
+	xfs_buf_t		*bp;
+	int			error, i, found;
+	xfs_daddr_t		umount_data_blk;
+	xfs_daddr_t		after_umount_blk;
+	xfs_lsn_t		tail_lsn;
+	int			hblks;
+
+	found = 0;
+
+	/*
+	 * Find previous log record
+	 */
+	if ((error = xlog_find_head(log, head_blk)))
+		return error;
+
+	bp = xlog_get_bp(1,log->l_mp);
+	if (!bp)
+		return ENOMEM;
+	if (*head_blk == 0) {				/* special case */
+		if ((error = xlog_bread(log, 0, 1, bp)))
+			goto bread_err;
+		if (GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT) == 0) {
+			*tail_blk = 0;
+			/* leave all other log inited values alone */
+			goto exit;
+		}
+	}
+
+	/*
+	 * Search backwards looking for log record header block
+	 */
+	ASSERT(*head_blk < INT_MAX);
+	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
+		if ((error = xlog_bread(log, i, 1, bp)))
+			goto bread_err;
+		if (XLOG_HEADER_MAGIC_NUM ==
+		    INT_GET(*(uint *)(XFS_BUF_PTR(bp)), ARCH_CONVERT)) {
+			found = 1;
+			break;
+		}
+	}
+	/*
+	 * If we haven't found the log record header block, start looking
+	 * again from the end of the physical log.  XXXmiken: There should be
+	 * a check here to make sure we didn't search more than N blocks in
+	 * the previous code.
+	 */
+	if (!found) {
+		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
+			if ((error = xlog_bread(log, i, 1, bp)))
+				goto bread_err;
+			if (XLOG_HEADER_MAGIC_NUM ==
+			    INT_GET(*(uint*)(XFS_BUF_PTR(bp)), ARCH_CONVERT)) {
+				found = 2;
+				break;
+			}
+		}
+	}
+	if (!found) {
+		xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
+		ASSERT(0);
+		return XFS_ERROR(EIO);
+	}
+
+	/* find blk_no of tail of log */
+	rhead = (xlog_rec_header_t *)XFS_BUF_PTR(bp);
+	*tail_blk = BLOCK_LSN(rhead->h_tail_lsn, ARCH_CONVERT);
+
+	/*
+	 * Reset log values according to the state of the log when we
+	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
+	 * one because the next write starts a new cycle rather than
+	 * continuing the cycle of the last good log record.  At this
+	 * point we have guaranteed that all partial log records have been
+	 * accounted for.  Therefore, we know that the last good log record
+	 * written was complete and ended exactly on the end boundary
+	 * of the physical log.
+	 */
+	log->l_prev_block = i;
+	log->l_curr_block = (int)*head_blk;
+	log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT);
+	if (found == 2)
+		log->l_curr_cycle++;
+	log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT);
+	log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT);
+	log->l_grant_reserve_cycle = log->l_curr_cycle;
+	log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
+	log->l_grant_write_cycle = log->l_curr_cycle;
+	log->l_grant_write_bytes = BBTOB(log->l_curr_block);
+
+	/*
+	 * Look for unmount record.  If we find it, then we know there
+	 * was a clean unmount.	 Since 'i' could be the last block in
+	 * the physical log, we convert to a log block before comparing
+	 * to the head_blk.
+	 *
+	 * Save the current tail lsn to use to pass to
+	 * xlog_clear_stale_blocks() below.  We won't want to clear the
+	 * unmount record if there is one, so we pass the lsn of the
+	 * unmount record rather than the block after it.
+	 */
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		int	h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
+		int	h_version = INT_GET(rhead->h_version, ARCH_CONVERT);
+
+		if ((h_version & XLOG_VERSION_2) &&
+		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+			if (h_size % XLOG_HEADER_CYCLE_SIZE)
+				hblks++;
+		} else {
+			hblks = 1;
+		}
+	} else {
+		hblks = 1;
+	}
+	after_umount_blk = (i + hblks + (int)
+		BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT))) % log->l_logBBsize;
+	tail_lsn = log->l_tail_lsn;
+	if (*head_blk == after_umount_blk &&
+	    INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) {
+		umount_data_blk = (i + hblks) % log->l_logBBsize;
+		if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
+			goto bread_err;
+		}
+		op_head = (xlog_op_header_t *)XFS_BUF_PTR(bp);
+		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
+			/*
+			 * Set tail and last sync so that newly written
+			 * log records will point recovery to after the
+			 * current unmount record.
+			 */
+			ASSIGN_ANY_LSN(log->l_tail_lsn, log->l_curr_cycle,
+					after_umount_blk, ARCH_NOCONVERT);
+			ASSIGN_ANY_LSN(log->l_last_sync_lsn, log->l_curr_cycle,
+					after_umount_blk, ARCH_NOCONVERT);
+			*tail_blk = after_umount_blk;
+		}
+	}
+
+#ifdef __KERNEL__
+	/*
+	 * Make sure that there are no blocks in front of the head
+	 * with the same cycle number as the head.  This can happen
+	 * because we allow multiple outstanding log writes concurrently,
+	 * and the later writes might make it out before earlier ones.
+	 *
+	 * We use the lsn from before modifying it so that we'll never
+	 * overwrite the unmount record after a clean unmount.
+	 *
+	 * Do this only if we are going to recover the filesystem
+	 */
+	if (!readonly)
+		error = xlog_clear_stale_blocks(log, tail_lsn);
+#endif
+
+bread_err:
+exit:
+	xlog_put_bp(bp);
+
+	if (error)
+		xlog_warn("XFS: failed to locate log tail");
+
+	return error;
+}	/* xlog_find_tail */
+
+
+/*
+ * Is the log zeroed at all?
+ *
+ * The last binary search should be changed to perform an X block read
+ * once X becomes small enough.	 You can then search linearly through
+ * the X blocks.  This will cut down on the number of reads we need to do.
+ *
+ * If the log is partially zeroed, this routine will pass back the blkno
+ * of the first block with cycle number 0.  It won't have a complete LR
+ * preceding it.
+ *
+ * Return:
+ *	0  => the log is completely written to
+ *	-1 => use *blk_no as the first block of the log
+ *	>0 => error has occurred
+ */
+int
+xlog_find_zeroed(struct log	*log,
+		 xfs_daddr_t	*blk_no)
+{
+	xfs_buf_t	*bp;
+	uint		first_cycle, last_cycle;
+	xfs_daddr_t	new_blk, last_blk, start_blk;
+	xfs_daddr_t	num_scan_bblks;
+	int		error, log_bbnum = log->l_logBBsize;
+
+	error = 0;
+	/* check totally zeroed log */
+	bp = xlog_get_bp(1,log->l_mp);
+	if (!bp)
+		return ENOMEM;
+	if ((error = xlog_bread(log, 0, 1, bp)))
+		goto bp_err;
+	first_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+	if (first_cycle == 0) {		/* completely zeroed log */
+		*blk_no = 0;
+		xlog_put_bp(bp);
+		return -1;
+	}
+
+	/* check partially zeroed log */
+	if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
+		goto bp_err;
+	last_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
+	if (last_cycle != 0) {		/* log completely written to */
+		xlog_put_bp(bp);
+		return 0;
+	} else if (first_cycle != 1) {
+		/*
+		 * If the cycle of the last block is zero, the cycle of
+		 * the first block must be 1. If it's not, maybe we're
+		 * not looking at a log... Bail out.
+		 */
+		xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
+		return XFS_ERROR(EINVAL);
+	}
+
+	/* we have a partially zeroed log */
+	last_blk = log_bbnum-1;
+	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
+		goto bp_err;
+
+	/*
+	 * Validate the answer.	 Because there is no way to guarantee that
+	 * the entire log is made up of log records which are the same size,
+	 * we scan over the defined maximum blocks.  At this point, the maximum
+	 * is not chosen to mean anything special.   XXXmiken
+	 */
+	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
+	ASSERT(num_scan_bblks <= INT_MAX);
+
+	if (last_blk < num_scan_bblks)
+		num_scan_bblks = last_blk;
+	start_blk = last_blk - num_scan_bblks;
+
+	/*
+	 * We search for any instances of cycle number 0 that occur before
+	 * our current estimate of the head.  What we're trying to detect is
+	 *	  1 ... | 0 | 1 | 0...
+	 *			 ^ binary search ends here
+	 */
+	if ((error = xlog_find_verify_cycle(log, start_blk,
+					 (int)num_scan_bblks, 0, &new_blk)))
+		goto bp_err;
+	if (new_blk != -1)
+		last_blk = new_blk;
+
+	/*
+	 * Potentially backup over partial log record write.  We don't need
+	 * to search the end of the log because we know it is zero.
+	 */
+	if ((error = xlog_find_verify_log_record(log, start_blk,
+				&last_blk, 0)) == -1) {
+	    error = XFS_ERROR(EIO);
+	    goto bp_err;
+	} else if (error)
+	    goto bp_err;
+
+	*blk_no = last_blk;
+bp_err:
+	xlog_put_bp(bp);
+	if (error)
+		return error;
+	return -1;
+}	/* xlog_find_zeroed */
+
+/*
+ * This is simply a subroutine used by xlog_clear_stale_blocks() below
+ * to initialize a buffer full of empty log record headers and write
+ * them into the log.
+ */
+STATIC int
+xlog_write_log_records(
+	xlog_t	*log,
+	int	cycle,
+	int	start_block,
+	int	blocks,
+	int	tail_cycle,
+	int	tail_block)
+{
+	xlog_rec_header_t	*recp;
+	int			i, j;
+	int			end_block = start_block + blocks;
+	int			error	    = 0;
+	xfs_buf_t		*bp;
+	char			*buf;
+	int			bufblks;
+
+	bufblks = 1 << ffs(blocks);
+	while (!(bp = xlog_get_bp(bufblks, log->l_mp))) {
+		bufblks >>= 1;
+		if (!bufblks)
+			return ENOMEM;
+	}
+
+	buf = XFS_BUF_PTR(bp);
+	recp = (xlog_rec_header_t*)buf;
+
+	memset(buf, 0, BBSIZE);
+	INT_SET(recp->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM);
+	INT_SET(recp->h_cycle, ARCH_CONVERT, cycle);
+	INT_SET(recp->h_version, ARCH_CONVERT,
+			XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1);
+	ASSIGN_ANY_LSN(recp->h_tail_lsn, tail_cycle, tail_block, ARCH_CONVERT);
+
+	for (i = start_block; i < end_block; i += bufblks) {
+		int bcount = min(bufblks, end_block - start_block);
+		/* with plenty of memory, we duplicate the block
+		 * right through the buffer and modify each entry
+		 */
+		ASSIGN_ANY_LSN(recp->h_lsn, cycle, i, ARCH_CONVERT);
+		for (j = 1; j < bcount; j++) {
+			buf += BBSIZE;
+			recp = (xlog_rec_header_t*)buf;
+			memcpy(buf, XFS_BUF_PTR(bp), BBSIZE);
+			ASSIGN_ANY_LSN(recp->h_lsn, cycle, i+j, ARCH_CONVERT);
+		}
+		/* then write the whole lot out at once */
+		error = xlog_bwrite(log, start_block, bcount, bp);
+		start_block += bcount;
+		buf = XFS_BUF_PTR(bp);
+		recp = (xlog_rec_header_t*)buf;
+	}
+	xlog_put_bp(bp);
+
+	return error;
+}
+
+/*
+ * This routine is called to blow away any incomplete log writes out
+ * in front of the log head.  We do this so that we won't become confused
+ * if we come up, write only a little bit more, and then crash again.
+ * If we leave the partial log records out there, this situation could
+ * cause us to think those partial writes are valid blocks since they
+ * have the current cycle number.  We get rid of them by overwriting them
+ * with empty log records with the old cycle number rather than the
+ * current one.
+ *
+ * The tail lsn is passed in rather than taken from
+ * the log so that we will not write over the unmount record after a
+ * clean unmount in a 512 block log.  Doing so would leave the log without
+ * any valid log records in it until a new one was written.  If we crashed
+ * during that time we would not be able to recover.
+ */
+STATIC int
+xlog_clear_stale_blocks(
+	xlog_t		*log,
+	xfs_lsn_t	tail_lsn)
+{
+	int			tail_cycle;
+	int			head_cycle;
+	int			tail_block;
+	int			head_block;
+	int			tail_distance;
+	int			max_distance;
+	int			distance;
+	int			error;
+
+	tail_cycle = CYCLE_LSN(tail_lsn, ARCH_NOCONVERT);
+	tail_block = BLOCK_LSN(tail_lsn, ARCH_NOCONVERT);
+	head_cycle = log->l_curr_cycle;
+	head_block = log->l_curr_block;
+
+	/*
+	 * Figure out the distance between the new head of the log
+	 * and the tail.  We want to write over any blocks beyond the
+	 * head that we may have written just before the crash, but
+	 * we don't want to overwrite the tail of the log.
+	 */
+	if (head_cycle == tail_cycle) {
+		/*
+		 * The tail is behind the head in the physical log,
+		 * so the distance from the head to the tail is the
+		 * distance from the head to the end of the log plus
+		 * the distance from the beginning of the log to the
+		 * tail.
+		 */
+		if (head_block < tail_block || head_block >= log->l_logBBsize)
+			return XFS_ERROR(EFSCORRUPTED);
+		tail_distance = tail_block +
+				(log->l_logBBsize - head_block);
+	} else {
+		/*
+		 * The head is behind the tail in the physical log,
+		 * so the distance from the head to the tail is just
+		 * the tail block minus the head block.
+		 */
+		if (head_block >= tail_block || head_cycle != (tail_cycle + 1))
+			return XFS_ERROR(EFSCORRUPTED);
+		tail_distance = tail_block - head_block;
+	}
+
+	/*
+	 * If the head is right up against the tail, we can't clear
+	 * anything.
+	 */
+	if (tail_distance <= 0) {
+		ASSERT(tail_distance == 0);
+		return 0;
+	}
+
+	max_distance = XLOG_TOTAL_REC_SHIFT(log);
+	/*
+	 * Take the smaller of the maximum amount of outstanding I/O
+	 * we could have and the distance to the tail to clear out.
+	 * We take the smaller so that we don't overwrite the tail and
+	 * we don't waste all day writing from the head to the tail
+	 * for no reason.
+	 */
+	max_distance = MIN(max_distance, tail_distance);
+
+	if ((head_block + max_distance) <= log->l_logBBsize) {
+		/*
+		 * We can stomp all the blocks we need to without
+		 * wrapping around the end of the log.	Just do it
+		 * in a single write.  Use the cycle number of the
+		 * current cycle minus one so that the log will look like:
+		 *     n ... | n - 1 ...
+		 */
+		error = xlog_write_log_records(log, (head_cycle - 1),
+				head_block, max_distance, tail_cycle,
+				tail_block);
+		if (error)
+			return error;
+	} else {
+		/*
+		 * We need to wrap around the end of the physical log in
+		 * order to clear all the blocks.  Do it in two separate
+		 * I/Os.  The first write should be from the head to the
+		 * end of the physical log, and it should use the current
+		 * cycle number minus one just like above.
+		 */
+		distance = log->l_logBBsize - head_block;
+		error = xlog_write_log_records(log, (head_cycle - 1),
+				head_block, distance, tail_cycle,
+				tail_block);
+
+		if (error)
+			return error;
+
+		/*
+		 * Now write the blocks at the start of the physical log.
+		 * This writes the remainder of the blocks we want to clear.
+		 * It uses the current cycle number since we're now on the
+		 * same cycle as the head so that we get:
+		 *    n ... n ... | n - 1 ...
+		 *    ^^^^^ blocks we're writing
+		 */
+		distance = max_distance - (log->l_logBBsize - head_block);
+		error = xlog_write_log_records(log, head_cycle, 0, distance,
+				tail_cycle, tail_block);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/******************************************************************************
+ *
+ *		Log recover routines
+ *
+ ******************************************************************************
+ */
+
+STATIC xlog_recover_t *
+xlog_recover_find_tid(xlog_recover_t *q,
+		      xlog_tid_t     tid)
+{
+	xlog_recover_t *p = q;
+
+	while (p != NULL) {
+		if (p->r_log_tid == tid)
+		    break;
+		p = p->r_next;
+	}
+	return p;
+}	/* xlog_recover_find_tid */
+
+
+STATIC void
+xlog_recover_put_hashq(xlog_recover_t **q,
+		       xlog_recover_t *trans)
+{
+	trans->r_next = *q;
+	*q = trans;
+}	/* xlog_recover_put_hashq */
+
+
+STATIC void
+xlog_recover_add_item(xlog_recover_item_t **itemq)
+{
+	xlog_recover_item_t *item;
+
+	item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
+	xlog_recover_insert_item_backq(itemq, item);
+}	/* xlog_recover_add_item */
+
+
+STATIC int
+xlog_recover_add_to_cont_trans(xlog_recover_t	*trans,
+			       xfs_caddr_t		dp,
+			       int		len)
+{
+	xlog_recover_item_t	*item;
+	xfs_caddr_t			ptr, old_ptr;
+	int			old_len;
+
+	item = trans->r_itemq;
+	if (item == 0) {
+		/* finish copying rest of trans header */
+		xlog_recover_add_item(&trans->r_itemq);
+		ptr = (xfs_caddr_t)&trans->r_theader+sizeof(xfs_trans_header_t)-len;
+		bcopy(dp, ptr, len); /* s, d, l */
+		return 0;
+	}
+	item = item->ri_prev;
+
+	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
+	old_len = item->ri_buf[item->ri_cnt-1].i_len;
+
+	ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0);
+	bcopy(dp , &ptr[old_len], len); /* s, d, l */
+	item->ri_buf[item->ri_cnt-1].i_len += len;
+	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+	return 0;
+}	/* xlog_recover_add_to_cont_trans */
+
+
+/* The next region to add is the start of a new region.	 It could be
+ * a whole region or it could be the first part of a new region.  Because
+ * of this, the assumption here is that the type and size fields of all
+ * format structures fit into the first 32 bits of the structure.
+ *
+ * This works because all regions must be 32 bit aligned.  Therefore, we
+ * either have both fields or we have neither field.  In the case we have
+ * neither field, the data part of the region is zero length.  We only have
+ * a log_op_header and can throw away the header since a new one will appear
+ * later.  If we have at least 4 bytes, then we can determine how many regions
+ * will appear in the current log item.
+ */
+STATIC int
+xlog_recover_add_to_trans(xlog_recover_t	*trans,
+			  xfs_caddr_t		dp,
+			  int			len)
+{
+	xfs_inode_log_format_t	 *in_f;			/* any will do */
+	xlog_recover_item_t	 *item;
+	xfs_caddr_t		 ptr;
+
+	if (!len)
+		return 0;
+	ptr = kmem_zalloc(len, 0);
+	bcopy(dp, ptr, len);
+
+	in_f = (xfs_inode_log_format_t *)ptr;
+	item = trans->r_itemq;
+	if (item == 0) {
+		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
+		if (len == sizeof(xfs_trans_header_t))
+			xlog_recover_add_item(&trans->r_itemq);
+		bcopy(dp, &trans->r_theader, len); /* s, d, l */
+		return 0;
+	}
+	if (item->ri_prev->ri_total != 0 &&
+	     item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
+		xlog_recover_add_item(&trans->r_itemq);
+	}
+	item = trans->r_itemq;
+	item = item->ri_prev;
+
+	if (item->ri_total == 0) {		/* first region to be added */
+		item->ri_total	= in_f->ilf_size;
+		ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
+		item->ri_buf = kmem_zalloc((item->ri_total *
+					    sizeof(xfs_log_iovec_t)), 0);
+	}
+	ASSERT(item->ri_total > item->ri_cnt);
+	/* Description region is ri_buf[0] */
+	item->ri_buf[item->ri_cnt].i_addr = ptr;
+	item->ri_buf[item->ri_cnt].i_len  = len;
+	item->ri_cnt++;
+	return 0;
+}	/* xlog_recover_add_to_trans */
+
+
+STATIC void
+xlog_recover_new_tid(xlog_recover_t	**q,
+		     xlog_tid_t		tid,
+		     xfs_lsn_t		lsn)
+{
+	xlog_recover_t	*trans;
+
+	trans = kmem_zalloc(sizeof(xlog_recover_t), 0);
+	trans->r_log_tid   = tid;
+	trans->r_lsn	   = lsn;
+	xlog_recover_put_hashq(q, trans);
+}	/* xlog_recover_new_tid */
+
+
+STATIC int
+xlog_recover_unlink_tid(xlog_recover_t	**q,
+			xlog_recover_t	*trans)
+{
+	xlog_recover_t	*tp;
+	int		found = 0;
+
+	ASSERT(trans != 0);
+	if (trans == *q) {
+		*q = (*q)->r_next;
+	} else {
+		tp = *q;
+		while (tp != 0) {
+			if (tp->r_next == trans) {
+				found = 1;
+				break;
+			}
+			tp = tp->r_next;
+		}
+		if (!found) {
+			xlog_warn(
+			     "XFS: xlog_recover_unlink_tid: trans not found");
+			ASSERT(0);
+			return XFS_ERROR(EIO);
+		}
+		tp->r_next = tp->r_next->r_next;
+	}
+	return 0;
+}	/* xlog_recover_unlink_tid */
+
+STATIC void
+xlog_recover_insert_item_backq(xlog_recover_item_t **q,
+			       xlog_recover_item_t *item)
+{
+	if (*q == 0) {
+		item->ri_prev = item->ri_next = item;
+		*q = item;
+	} else {
+		item->ri_next		= *q;
+		item->ri_prev		= (*q)->ri_prev;
+		(*q)->ri_prev		= item;
+		item->ri_prev->ri_next	= item;
+	}
+}	/* xlog_recover_insert_item_backq */
+
+STATIC void
+xlog_recover_insert_item_frontq(xlog_recover_item_t **q,
+				xlog_recover_item_t *item)
+{
+	xlog_recover_insert_item_backq(q, item);
+	*q = item;
+}	/* xlog_recover_insert_item_frontq */
+
+STATIC int
+xlog_recover_reorder_trans(xlog_t	  *log,
+			   xlog_recover_t *trans)
+{
+    xlog_recover_item_t *first_item, *itemq, *itemq_next;
+
+    first_item = itemq = trans->r_itemq;
+    trans->r_itemq = NULL;
+    do {
+	itemq_next = itemq->ri_next;
+	switch (ITEM_TYPE(itemq)) {
+	    case XFS_LI_BUF:
+	    case XFS_LI_6_1_BUF:
+	    case XFS_LI_5_3_BUF: {
+		xlog_recover_insert_item_frontq(&trans->r_itemq, itemq);
+		break;
+	    }
+	    case XFS_LI_INODE:
+	    case XFS_LI_6_1_INODE:
+	    case XFS_LI_5_3_INODE:
+	    case XFS_LI_DQUOT:
+	    case XFS_LI_QUOTAOFF:
+	    case XFS_LI_EFD:
+	    case XFS_LI_EFI: {
+		xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
+		break;
+	    }
+	    default: {
+		xlog_warn(
+	"XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
+		ASSERT(0);
+		return XFS_ERROR(EIO);
+	    }
+	}
+	itemq = itemq_next;
+    } while (first_item != itemq);
+    return 0;
+}	/* xlog_recover_reorder_trans */
+
+
+/*
+ * Build up the table of buf cancel records so that we don't replay
+ * cancelled data in the second pass.  For buffer records that are
+ * not cancel records, there is nothing to do here so we just return.
+ *
+ * If we get a cancel record which is already in the table, this indicates
+ * that the buffer was cancelled multiple times.  In order to ensure
+ * that during pass 2 we keep the record in the table until we reach its
+ * last occurrence in the log, we keep a reference count in the cancel
+ * record in the table to tell us how many times we expect to see this
+ * record during the second pass.
+ */
+STATIC void
+xlog_recover_do_buffer_pass1(xlog_t			*log,
+			     xfs_buf_log_format_t	*buf_f)
+{
+	xfs_buf_cancel_t	*bcp;
+	xfs_buf_cancel_t	*nextp;
+	xfs_buf_cancel_t	*prevp;
+	xfs_buf_cancel_t	**bucket;
+	xfs_buf_log_format_v1_t *obuf_f;
+	xfs_daddr_t		blkno=0;
+	uint			len=0;
+	ushort			flags=0;
+
+	switch (buf_f->blf_type) {
+	case XFS_LI_BUF:
+		blkno = buf_f->blf_blkno;
+		len = buf_f->blf_len;
+		flags = buf_f->blf_flags;
+		break;
+	case XFS_LI_6_1_BUF:
+	case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		blkno = (xfs_daddr_t) obuf_f->blf_blkno;
+		len = obuf_f->blf_len;
+		flags = obuf_f->blf_flags;
+		break;
+	}
+
+	/*
+	 * If this isn't a cancel buffer item, then just return.
+	 */
+	if (!(flags & XFS_BLI_CANCEL)) {
+		return;
+	}
+
+	/*
+	 * Insert an xfs_buf_cancel record into the hash table of
+	 * them.  If there is already an identical record, bump
+	 * its reference count.
+	 */
+	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
+					  XLOG_BC_TABLE_SIZE];
+	/*
+	 * If the hash bucket is empty then just insert a new record into
+	 * the bucket.
+	 */
+	if (*bucket == NULL) {
+		bcp = (xfs_buf_cancel_t*)kmem_alloc(sizeof(xfs_buf_cancel_t),
+						    KM_SLEEP);
+		bcp->bc_blkno = blkno;
+		bcp->bc_len = len;
+		bcp->bc_refcount = 1;
+		bcp->bc_next = NULL;
+		*bucket = bcp;
+		return;
+	}
+
+	/*
+	 * The hash bucket is not empty, so search for duplicates of our
+	 * record.  If we find one them just bump its refcount.	 If not
+	 * then add us at the end of the list.
+	 */
+	prevp = NULL;
+	nextp = *bucket;
+	while (nextp != NULL) {
+		if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
+			nextp->bc_refcount++;
+			return;
+		}
+		prevp = nextp;
+		nextp = nextp->bc_next;
+	}
+	ASSERT(prevp != NULL);
+	bcp = (xfs_buf_cancel_t*)kmem_alloc(sizeof(xfs_buf_cancel_t),
+					    KM_SLEEP);
+	bcp->bc_blkno = blkno;
+	bcp->bc_len = len;
+	bcp->bc_refcount = 1;
+	bcp->bc_next = NULL;
+	prevp->bc_next = bcp;
+}
+
+/*
+ * Check to see whether the buffer being recovered has a corresponding
+ * entry in the buffer cancel record table.  If it does then return 1
+ * so that it will be cancelled, otherwise return 0.  If the buffer is
+ * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
+ * the refcount on the entry in the table and remove it from the table
+ * if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its
+ * last occurrence in the log so that if the same buffer is re-used
+ * again after its last cancellation we actually replay the changes
+ * made at that point.
+ */
+STATIC int
+xlog_recover_do_buffer_pass2(xlog_t			*log,
+			     xfs_buf_log_format_t	*buf_f)
+{
+	xfs_buf_cancel_t	*bcp;
+	xfs_buf_cancel_t	*prevp;
+	xfs_buf_cancel_t	**bucket;
+	xfs_buf_log_format_v1_t *obuf_f;
+	xfs_daddr_t		blkno=0;
+	ushort			flags=0;
+	uint			len=0;
+
+
+	switch (buf_f->blf_type) {
+	case XFS_LI_BUF:
+		blkno = buf_f->blf_blkno;
+		flags = buf_f->blf_flags;
+		len = buf_f->blf_len;
+		break;
+	case XFS_LI_6_1_BUF:
+	case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		blkno = (xfs_daddr_t) obuf_f->blf_blkno;
+		flags = obuf_f->blf_flags;
+		len = (xfs_daddr_t) obuf_f->blf_len;
+		break;
+	}
+	if (log->l_buf_cancel_table == NULL) {
+		/*
+		 * There is nothing in the table built in pass one,
+		 * so this buffer must not be cancelled.
+		 */
+		ASSERT(!(flags & XFS_BLI_CANCEL));
+		return 0;
+	}
+
+	bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
+					  XLOG_BC_TABLE_SIZE];
+	bcp = *bucket;
+	if (bcp == NULL) {
+		/*
+		 * There is no corresponding entry in the table built
+		 * in pass one, so this buffer has not been cancelled.
+		 */
+		ASSERT(!(flags & XFS_BLI_CANCEL));
+		return 0;
+	}
+
+	/*
+	 * Search for an entry in the buffer cancel table that
+	 * matches our buffer.
+	 */
+	prevp = NULL;
+	while (bcp != NULL) {
+		if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
+			/*
+			 * We've go a match, so return 1 so that the
+			 * recovery of this buffer is cancelled.
+			 * If this buffer is actually a buffer cancel
+			 * log item, then decrement the refcount on the
+			 * one in the table and remove it if this is the
+			 * last reference.
+			 */
+			if (flags & XFS_BLI_CANCEL) {
+				bcp->bc_refcount--;
+				if (bcp->bc_refcount == 0) {
+					if (prevp == NULL) {
+						*bucket = bcp->bc_next;
+					} else {
+						prevp->bc_next = bcp->bc_next;
+					}
+					kmem_free(bcp,
+						  sizeof(xfs_buf_cancel_t));
+				}
+			}
+			return 1;
+		}
+		prevp = bcp;
+		bcp = bcp->bc_next;
+	}
+	/*
+	 * We didn't find a corresponding entry in the table, so
+	 * return 0 so that the buffer is NOT cancelled.
+	 */
+	ASSERT(!(flags & XFS_BLI_CANCEL));
+	return 0;
+}
+
+
+/*
+ * Perform recovery for a buffer full of inodes.  In these buffers,
+ * the only data which should be recovered is that which corresponds
+ * to the di_next_unlinked pointers in the on disk inode structures.
+ * The rest of the data for the inodes is always logged through the
+ * inodes themselves rather than the inode buffer and is recovered
+ * in xlog_recover_do_inode_trans().
+ *
+ * The only time when buffers full of inodes are fully recovered is
+ * when the buffer is full of newly allocated inodes.  In this case
+ * the buffer will not be marked as an inode buffer and so will be
+ * sent to xlog_recover_do_reg_buffer() below during recovery.
+ */
+STATIC int
+xlog_recover_do_inode_buffer(xfs_mount_t		*mp,
+			     xlog_recover_item_t	*item,
+			     xfs_buf_t			*bp,
+			     xfs_buf_log_format_t	*buf_f)
+{
+	int			i;
+	int			item_index;
+	int			bit;
+	int			nbits;
+	int			reg_buf_offset;
+	int			reg_buf_bytes;
+	int			next_unlinked_offset;
+	int			inodes_per_buf;
+	xfs_agino_t		*logged_nextp;
+	xfs_agino_t		*buffer_nextp;
+	xfs_buf_log_format_v1_t *obuf_f;
+	unsigned int		*data_map=NULL;
+	unsigned int		map_size=0;
+
+	switch (buf_f->blf_type) {
+	case XFS_LI_BUF:
+		data_map = buf_f->blf_data_map;
+		map_size = buf_f->blf_map_size;
+		break;
+	case XFS_LI_6_1_BUF:
+	case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		data_map = obuf_f->blf_data_map;
+		map_size = obuf_f->blf_map_size;
+		break;
+	}
+	/*
+	 * Set the variables corresponding to the current region to
+	 * 0 so that we'll initialize them on the first pass through
+	 * the loop.
+	 */
+	reg_buf_offset = 0;
+	reg_buf_bytes = 0;
+	bit = 0;
+	nbits = 0;
+	item_index = 0;
+	inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
+	for (i = 0; i < inodes_per_buf; i++) {
+		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
+			offsetof(xfs_dinode_t, di_next_unlinked);
+
+		while (next_unlinked_offset >=
+		       (reg_buf_offset + reg_buf_bytes)) {
+			/*
+			 * The next di_next_unlinked field is beyond
+			 * the current logged region.  Find the next
+			 * logged region that contains or is beyond
+			 * the current di_next_unlinked field.
+			 */
+			bit += nbits;
+			bit = xfs_next_bit(data_map, map_size, bit);
+
+			/*
+			 * If there are no more logged regions in the
+			 * buffer, then we're done.
+			 */
+			if (bit == -1) {
+				return 0;
+			}
+
+			nbits = xfs_contig_bits(data_map, map_size,
+							 bit);
+			reg_buf_offset = bit << XFS_BLI_SHIFT;
+			reg_buf_bytes = nbits << XFS_BLI_SHIFT;
+			item_index++;
+		}
+
+		/*
+		 * If the current logged region starts after the current
+		 * di_next_unlinked field, then move on to the next
+		 * di_next_unlinked field.
+		 */
+		if (next_unlinked_offset < reg_buf_offset) {
+			continue;
+		}
+
+		ASSERT(item->ri_buf[item_index].i_addr != NULL);
+		ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
+		ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
+
+		/*
+		 * The current logged region contains a copy of the
+		 * current di_next_unlinked field.  Extract its value
+		 * and copy it to the buffer copy.
+		 */
+		logged_nextp = (xfs_agino_t *)
+			       ((char *)(item->ri_buf[item_index].i_addr) +
+				(next_unlinked_offset - reg_buf_offset));
+		if (*logged_nextp == 0)	 {
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
+				item, bp);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+
+		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
+					      next_unlinked_offset);
+		INT_SET(*buffer_nextp, ARCH_CONVERT, *logged_nextp);
+	}
+
+	return 0;
+}	/* xlog_recover_do_inode_buffer */
+
+/*
+ * Perform a 'normal' buffer recovery.	Each logged region of the
+ * buffer should be copied over the corresponding region in the
+ * given buffer.  The bitmap in the buf log format structure indicates
+ * where to place the logged data.
+ */
+/*ARGSUSED*/
+STATIC void
+xlog_recover_do_reg_buffer(xfs_mount_t		*mp,
+			   xlog_recover_item_t	*item,
+			   xfs_buf_t		*bp,
+			   xfs_buf_log_format_t *buf_f)
+{
+	int			i;
+	int			bit;
+	int			nbits;
+	xfs_buf_log_format_v1_t *obuf_f;
+	unsigned int		*data_map=NULL;
+	unsigned int		map_size=0;
+	int			error;
+
+	switch (buf_f->blf_type) {
+	case XFS_LI_BUF:
+		data_map = buf_f->blf_data_map;
+		map_size = buf_f->blf_map_size;
+		break;
+	case XFS_LI_6_1_BUF:
+	case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		data_map = obuf_f->blf_data_map;
+		map_size = obuf_f->blf_map_size;
+		break;
+	}
+	bit = 0;
+	i = 1;	/* 0 is the buf format structure */
+	while (1) {
+		bit = xfs_next_bit(data_map, map_size, bit);
+		if (bit == -1)
+			break;
+		nbits = xfs_contig_bits(data_map, map_size, bit);
+		ASSERT(item->ri_buf[i].i_addr != 0);
+		ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
+		ASSERT(XFS_BUF_COUNT(bp) >=
+		       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
+
+		/*
+		 * Do a sanity check if this is a dquot buffer. Just checking
+		 * the first dquot in the buffer should do. XXXThis is
+		 * probably a good thing to do for other buf types also.
+		 */
+		error = 0;
+		if (buf_f->blf_flags & (XFS_BLI_UDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+			/* OK, if this returns nopkg() */
+			error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
+					       item->ri_buf[i].i_addr,
+					       -1, 0, XFS_QMOPT_DOWARN,
+					       "dquot_buf_recover");
+		}
+		if (!error)
+		    bcopy(item->ri_buf[i].i_addr,		   /* source */
+		      xfs_buf_offset(bp, (uint)bit << XFS_BLI_SHIFT), /* dest */
+		      nbits<<XFS_BLI_SHIFT);			   /* length */
+		i++;
+		bit += nbits;
+	}
+
+	/* Shouldn't be any more regions */
+	ASSERT(i == item->ri_total);
+}	/* xlog_recover_do_reg_buffer */
+
+
+/*
+ * Perform a dquot buffer recovery.
+ * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * (ie. USR or GRP), then just toss this buffer away; don't recover it.
+ * Else, treat it as a regular buffer and do recovery.
+ */
+STATIC void
+xlog_recover_do_dquot_buffer(
+	xfs_mount_t		*mp,
+	xlog_t			*log,
+	xlog_recover_item_t	*item,
+	xfs_buf_t		*bp,
+	xfs_buf_log_format_t	*buf_f)
+{
+	uint type;
+
+	/*
+	 * Non-root filesystems are required to send in quota flags
+	 * at mount time.
+	 */
+	if (mp->m_qflags == 0 && mp->m_dev != rootdev) {
+		return;
+	}
+	
+	type = 0;
+	if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
+		type |= XFS_DQ_USER;
+	if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
+		type |= XFS_DQ_GROUP;
+	/*
+	 * This type of quotas was turned off, so ignore this buffer
+	 */
+	if (log->l_quotaoffs_flag & type)
+		return;
+
+	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+}
+
+/*
+ * This routine replays a modification made to a buffer at runtime.
+ * There are actually two types of buffer, regular and inode, which
+ * are handled differently.  Inode buffers are handled differently
+ * in that we only recover a specific set of data from them, namely
+ * the inode di_next_unlinked fields.  This is because all other inode
+ * data is actually logged via inode records and any data we replay
+ * here which overlaps that may be stale.
+ *
+ * When meta-data buffers are freed at run time we log a buffer item
+ * with the XFS_BLI_CANCEL bit set to indicate that previous copies
+ * of the buffer in the log should not be replayed at recovery time.
+ * This is so that if the blocks covered by the buffer are reused for
+ * file data before we crash we don't end up replaying old, freed
+ * meta-data into a user's file.
+ *
+ * To handle the cancellation of buffer log items, we make two passes
+ * over the log during recovery.  During the first we build a table of
+ * those buffers which have been cancelled, and during the second we
+ * only replay those buffers which do not have corresponding cancel
+ * records in the table.  See xlog_recover_do_buffer_pass[1,2] above
+ * for more details on the implementation of the table of cancel records.
+ */
+STATIC int
+xlog_recover_do_buffer_trans(xlog_t		 *log,
+			     xlog_recover_item_t *item,
+			     int		 pass)
+{
+	xfs_buf_log_format_t	*buf_f;
+	xfs_buf_log_format_v1_t *obuf_f;
+	xfs_mount_t		*mp;
+	xfs_buf_t		*bp;
+	int			error;
+	int			cancel;
+	xfs_daddr_t		blkno;
+	int			len;
+	ushort			flags;
+
+	buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		/*
+		 * In this pass we're only looking for buf items
+		 * with the XFS_BLI_CANCEL bit set.
+		 */
+		xlog_recover_do_buffer_pass1(log, buf_f);
+		return 0;
+	} else {
+		/*
+		 * In this pass we want to recover all the buffers
+		 * which have not been cancelled and are not
+		 * cancellation buffers themselves.  The routine
+		 * we call here will tell us whether or not to
+		 * continue with the replay of this buffer.
+		 */
+		cancel = xlog_recover_do_buffer_pass2(log, buf_f);
+		if (cancel) {
+			return 0;
+		}
+	}
+	switch (buf_f->blf_type) {
+	      case XFS_LI_BUF:
+		blkno = buf_f->blf_blkno;
+		len = buf_f->blf_len;
+		flags = buf_f->blf_flags;
+		break;
+	      case XFS_LI_6_1_BUF:
+	      case XFS_LI_5_3_BUF:
+		obuf_f = (xfs_buf_log_format_v1_t*)buf_f;
+		blkno = obuf_f->blf_blkno;
+		len = obuf_f->blf_len;
+		flags = obuf_f->blf_flags;
+		break;
+	      default:
+		xfs_fs_cmn_err(CE_ALERT, log->l_mp,
+			"xfs_log_recover: unknown buffer type 0x%x, dev 0x%x",
+			buf_f->blf_type, log->l_dev);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	mp = log->l_mp;
+	if (flags & XFS_BLI_INODE_BUF) {
+		bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
+								XFS_BUF_LOCK);
+	} else {
+		bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
+	}
+	if (XFS_BUF_ISERROR(bp)) {
+		xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
+				  bp, blkno);
+		error = XFS_BUF_GETERROR(bp);
+		xfs_buf_relse(bp);
+		return error;
+	}
+
+	error = 0;
+	if (flags & XFS_BLI_INODE_BUF) {
+		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
+	} else if (flags & (XFS_BLI_UDQUOT_BUF | XFS_BLI_GDQUOT_BUF)) {
+		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
+	} else {
+		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
+	}
+	if (error)
+		return XFS_ERROR(error);
+
+	/*
+	 * Perform delayed write on the buffer.	 Asynchronous writes will be
+	 * slower when taking into account all the buffers to be flushed.
+	 *
+	 * Also make sure that only inode buffers with good sizes stay in
+	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
+	 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
+	 * buffers in the log can be a different size if the log was generated
+	 * by an older kernel using unclustered inode buffers or a newer kernel
+	 * running with a different inode cluster size.	 Regardless, if the
+	 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
+	 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+	 * the buffer out of the buffer cache so that the buffer won't
+	 * overlap with future reads of those inodes.
+	 */
+	error = 0;
+
+	if ((INT_GET(*((__uint16_t *)(xfs_buf_offset(bp, 0))), ARCH_CONVERT) == XFS_DINODE_MAGIC) &&
+	    (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
+			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+	  XFS_BUF_STALE(bp);
+	  error = xfs_bwrite(mp, bp);
+	} else {
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
+		       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
+		XFS_BUF_SET_FSPRIVATE(bp, mp);
+		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+		xfs_bdwrite(mp, bp);
+	}
+
+	return (error);
+}	/* xlog_recover_do_buffer_trans */
+
+STATIC int
+xlog_recover_do_inode_trans(xlog_t		*log,
+			    xlog_recover_item_t *item,
+			    int			pass)
+{
+	xfs_inode_log_format_t	*in_f;
+	xfs_mount_t		*mp;
+	xfs_buf_t		*bp;
+	xfs_imap_t		imap;
+	xfs_dinode_t		*dip;
+	xfs_ino_t		ino;
+	int			len;
+	xfs_caddr_t		src;
+	xfs_caddr_t		dest;
+	int			error;
+	int			attr_index;
+	uint			fields;
+	xfs_dinode_core_t	*dicp;
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		return 0;
+	}
+
+	in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
+	ino = in_f->ilf_ino;
+	mp = log->l_mp;
+	if (ITEM_TYPE(item) == XFS_LI_INODE) {
+		imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
+		imap.im_len = in_f->ilf_len;
+		imap.im_boffset = in_f->ilf_boffset;
+	} else {
+		/*
+		 * It's an old inode format record.  We don't know where
+		 * its cluster is located on disk, and we can't allow
+		 * xfs_imap() to figure it out because the inode btrees
+		 * are not ready to be used.  Therefore do not pass the
+		 * XFS_IMAP_LOOKUP flag to xfs_imap().	This will give
+		 * us only the single block in which the inode lives
+		 * rather than its cluster, so we must make sure to
+		 * invalidate the buffer when we write it out below.
+		 */
+		imap.im_blkno = 0;
+		xfs_imap(log->l_mp, 0, ino, &imap, 0);
+	}
+	bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
+								XFS_BUF_LOCK);
+	if (XFS_BUF_ISERROR(bp)) {
+		xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
+				  bp, imap.im_blkno);
+		error = XFS_BUF_GETERROR(bp);
+		xfs_buf_relse(bp);
+		return error;
+	}
+	error = 0;
+	xfs_inobp_check(mp, bp);
+	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+
+	/*
+	 * Make sure the place we're flushing out to really looks
+	 * like an inode!
+	 */
+	if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
+			dip, bp, ino);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr);
+	if (dicp->di_magic != XFS_DINODE_MAGIC) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
+			item, ino);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	if ((dicp->di_mode & IFMT) == IFREG) {
+		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
+			xfs_buf_relse(bp);
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				item, dip, bp, ino);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	} else if ((dicp->di_mode & IFMT) == IFDIR) {
+		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
+		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
+		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
+			xfs_buf_relse(bp);
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				item, dip, bp, ino);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+	}
+	if (dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
+			item, dip, bp, ino,
+			dicp->di_nextents + dicp->di_anextents,
+			dicp->di_nblocks);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	if (dicp->di_forkoff > mp->m_sb.sb_inodesize) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
+			item, dip, bp, ino, dicp->di_forkoff);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+	if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
+		xfs_buf_relse(bp);
+		xfs_fs_cmn_err(CE_ALERT, mp,
+			"xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
+			item->ri_buf[1].i_len, item);
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/* The core is in in-core format */
+	xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
+			      (xfs_dinode_core_t*)item->ri_buf[1].i_addr,
+			      -1, ARCH_CONVERT);
+	/* the rest is in on-disk format */
+	if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
+		bcopy(item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t),
+		      (xfs_caddr_t) dip		 + sizeof(xfs_dinode_core_t),
+		      item->ri_buf[1].i_len  - sizeof(xfs_dinode_core_t));
+	}
+
+	fields = in_f->ilf_fields;
+	switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
+	case XFS_ILOG_DEV:
+		INT_SET(dip->di_u.di_dev, ARCH_CONVERT, in_f->ilf_u.ilfu_rdev);
+
+		break;
+	case XFS_ILOG_UUID:
+		dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid;
+		break;
+	}
+
+	if (in_f->ilf_size == 2)
+		goto write_inode_buffer;
+	len = item->ri_buf[2].i_len;
+	src = item->ri_buf[2].i_addr;
+	ASSERT(in_f->ilf_size <= 4);
+	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
+	ASSERT(!(fields & XFS_ILOG_DFORK) ||
+	       (len == in_f->ilf_dsize));
+
+	switch (fields & XFS_ILOG_DFORK) {
+	case XFS_ILOG_DDATA:
+	case XFS_ILOG_DEXT:
+		bcopy(src, &dip->di_u, len);
+		break;
+
+	case XFS_ILOG_DBROOT:
+		xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
+				 &(dip->di_u.di_bmbt),
+				 XFS_DFORK_DSIZE(dip, mp));
+		break;
+
+	default:
+		/*
+		 * There are no data fork flags set.
+		 */
+		ASSERT((fields & XFS_ILOG_DFORK) == 0);
+		break;
+	}
+
+
+
+	/*
+	 * If we logged any attribute data, recover it.	 There may or
+	 * may not have been any other non-core data logged in this
+	 * transaction.
+	 */
+	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
+		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
+			attr_index = 3;
+		} else {
+			attr_index = 2;
+		}
+		len = item->ri_buf[attr_index].i_len;
+		src = item->ri_buf[attr_index].i_addr;
+		ASSERT(len == in_f->ilf_asize);
+
+		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
+		case XFS_ILOG_ADATA:
+		case XFS_ILOG_AEXT:
+			dest = XFS_DFORK_APTR(dip);
+			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
+			bcopy(src, dest, len);
+			break;
+
+		case XFS_ILOG_ABROOT:
+			dest = XFS_DFORK_APTR(dip);
+			xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
+					 (xfs_bmdr_block_t*)dest,
+					 XFS_DFORK_ASIZE(dip, mp));
+			break;
+
+		default:
+			xlog_warn("XFS: xlog_recover_do_inode_trans: Illegal flag");
+			ASSERT(0);
+			xfs_buf_relse(bp);
+			return XFS_ERROR(EIO);
+		}
+	}
+
+
+write_inode_buffer:
+#if 0
+	/*
+	 * Can't do this if the transaction didn't log the current
+	 * contents, e.g. rmdir.
+	 */
+	XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip);
+#endif
+	xfs_inobp_check(mp, bp);
+	if (ITEM_TYPE(item) == XFS_LI_INODE) {
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
+		       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
+		XFS_BUF_SET_FSPRIVATE(bp, mp);
+		XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+		xfs_bdwrite(mp, bp);
+	} else {
+		XFS_BUF_STALE(bp);
+		error = xfs_bwrite(mp, bp);
+	}
+
+	return (error);
+}	/* xlog_recover_do_inode_trans */
+
+
+/*
+ * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
+ * structure, so that we know not to do any dquot item or dquot buffer recovery,
+ * of that type.
+ */
+STATIC int
+xlog_recover_do_quotaoff_trans(xlog_t			*log,
+			       xlog_recover_item_t	*item,
+			       int			pass)
+{
+	xfs_qoff_logformat_t *qoff_f;
+
+	if (pass == XLOG_RECOVER_PASS2) {
+		return (0);
+	}
+
+	qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
+	ASSERT(qoff_f);
+
+	/*
+	 * The logitem format's flag tells us if this was user quotaoff,
+	 * group quotaoff or both.
+	 */
+	if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
+		log->l_quotaoffs_flag |= XFS_DQ_USER;
+	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
+		log->l_quotaoffs_flag |= XFS_DQ_GROUP;
+
+	return (0);
+}
+
+
+/*
+ * Recover a dquot record
+ */
+STATIC int
+xlog_recover_do_dquot_trans(xlog_t		*log,
+			    xlog_recover_item_t *item,
+			    int			pass)
+{
+	xfs_mount_t		*mp;
+	xfs_buf_t		*bp;
+	struct xfs_disk_dquot	*ddq, *recddq;
+	int			error;
+	xfs_dq_logformat_t	*dq_f;
+	uint			type;
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		return 0;
+	}
+	mp = log->l_mp;
+
+	/*
+	 * Non-root filesystems are required to send in quota flags
+	 * at mount time.
+	 */
+	if (mp->m_qflags == 0 && mp->m_dev != rootdev) {
+		return (0);
+	}
+
+	recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
+	ASSERT(recddq);
+	/*
+	 * This type of quotas was turned off, so ignore this record.
+	 */
+	type = INT_GET(recddq->d_flags, ARCH_CONVERT)&(XFS_DQ_USER|XFS_DQ_GROUP);
+	ASSERT(type);
+	if (log->l_quotaoffs_flag & type)
+		return (0);
+
+	/*
+	 * At this point we know that if we are recovering a root filesystem
+	 * then quota was _not_ turned off. Since there is no other flag
+	 * indicate to us otherwise, this must mean that quota's on,
+	 * and the dquot needs to be replayed. Remember that we may not have
+	 * fully recovered the superblock yet, so we can't do the usual trick
+	 * of looking at the SB quota bits.
+	 *
+	 * The other possibility, of course, is that the quota subsystem was
+	 * removed since the last mount - nopkg().
+	 */
+	dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
+	ASSERT(dq_f);
+	if ((error = xfs_qm_dqcheck(recddq,
+			   dq_f->qlf_id,
+			   0, XFS_QMOPT_DOWARN,
+			   "xlog_recover_do_dquot_trans (log copy)"))) {
+	  if ((error == nopkg()))
+			return (0);
+		return XFS_ERROR(EIO);
+	}
+	ASSERT(dq_f->qlf_len == 1);
+
+	error = xfs_read_buf(mp, mp->m_ddev_targp,
+			     dq_f->qlf_blkno,
+			     XFS_FSB_TO_BB(mp, dq_f->qlf_len),
+			     0, &bp);
+	if (error) {
+		xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
+				  bp, dq_f->qlf_blkno);
+		return error;
+	}
+	ASSERT(bp);
+	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
+
+	/*
+	 * At least the magic num portion should be on disk because this
+	 * was among a chunk of dquots created earlier, and we did some
+	 * minimal initialization then.
+	 */
+	if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+			   "xlog_recover_do_dquot_trans")) {
+		xfs_buf_relse(bp);
+		return XFS_ERROR(EIO);
+	}
+
+	bcopy(recddq, ddq, item->ri_buf[1].i_len);
+
+	ASSERT(dq_f->qlf_size == 2);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
+	       XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
+	XFS_BUF_SET_FSPRIVATE(bp, mp);
+	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+	xfs_bdwrite(mp, bp);
+
+	return (0);
+}	/* xlog_recover_do_dquot_trans */
+
+/*
+ * This routine is called to create an in-core extent free intent
+ * item from the efi format structure which was logged on disk.
+ * It allocates an in-core efi, copies the extents from the format
+ * structure into it, and adds the efi to the AIL with the given
+ * LSN.
+ */
+STATIC void
+xlog_recover_do_efi_trans(xlog_t		*log,
+			  xlog_recover_item_t	*item,
+			  xfs_lsn_t		lsn,
+			  int			pass)
+{
+	xfs_mount_t		*mp;
+	xfs_efi_log_item_t	*efip;
+	xfs_efi_log_format_t	*efi_formatp;
+	SPLDECL(s);
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		return;
+	}
+
+	efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
+	ASSERT(item->ri_buf[0].i_len ==
+	       (sizeof(xfs_efi_log_format_t) +
+		((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t))));
+
+	mp = log->l_mp;
+	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
+	bcopy((char *)efi_formatp, (char *)&(efip->efi_format),
+	      sizeof(xfs_efi_log_format_t) +
+	      ((efi_formatp->efi_nextents - 1) * sizeof(xfs_extent_t)));
+	efip->efi_next_extent = efi_formatp->efi_nextents;
+	efip->efi_flags |= XFS_EFI_COMMITTED;
+
+	AIL_LOCK(mp,s);
+	/*
+	 * xfs_trans_update_ail() drops the AIL lock.
+	 */
+	xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s);
+}	/* xlog_recover_do_efi_trans */
+
+
+/*
+ * This routine is called when an efd format structure is found in
+ * a committed transaction in the log.	It's purpose is to cancel
+ * the corresponding efi if it was still in the log.  To do this
+ * it searches the AIL for the efi with an id equal to that in the
+ * efd format structure.  If we find it, we remove the efi from the
+ * AIL and free it.
+ */
+STATIC void
+xlog_recover_do_efd_trans(xlog_t		*log,
+			  xlog_recover_item_t	*item,
+			  int			pass)
+{
+	xfs_mount_t		*mp;
+	xfs_efd_log_format_t	*efd_formatp;
+	xfs_efi_log_item_t	*efip=NULL;
+	xfs_log_item_t		*lip;
+	int			gen;
+	int			nexts;
+	__uint64_t		efi_id;
+	SPLDECL(s);
+
+	if (pass == XLOG_RECOVER_PASS1) {
+		return;
+	}
+
+	efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
+	ASSERT(item->ri_buf[0].i_len ==
+	       (sizeof(xfs_efd_log_format_t) +
+		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_t))));
+	efi_id = efd_formatp->efd_efi_id;
+
+	/*
+	 * Search for the efi with the id in the efd format structure
+	 * in the AIL.
+	 */
+	mp = log->l_mp;
+	AIL_LOCK(mp,s);
+	lip = xfs_trans_first_ail(mp, &gen);
+	while (lip != NULL) {
+		if (lip->li_type == XFS_LI_EFI) {
+			efip = (xfs_efi_log_item_t *)lip;
+			if (efip->efi_format.efi_id == efi_id) {
+				/*
+				 * xfs_trans_delete_ail() drops the
+				 * AIL lock.
+				 */
+				xfs_trans_delete_ail(mp, lip, s);
+				break;
+			}
+		}
+		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+	}
+	if (lip == NULL) {
+		AIL_UNLOCK(mp, s);
+	}
+
+	/*
+	 * If we found it, then free it up.  If it wasn't there, it
+	 * must have been overwritten in the log.  Oh well.
+	 */
+	if (lip != NULL) {
+		nexts = efip->efi_format.efi_nextents;
+		if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
+			kmem_free(lip, sizeof(xfs_efi_log_item_t) +
+				  ((nexts - 1) * sizeof(xfs_extent_t)));
+		} else {
+			kmem_zone_free(xfs_efi_zone, efip);
+	}
+	}
+}	/* xlog_recover_do_efd_trans */
+
+/*
+ * Perform the transaction
+ *
+ * If the transaction modifies a buffer or inode, do it now.  Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
+STATIC int
+xlog_recover_do_trans(xlog_t	     *log,
+		      xlog_recover_t *trans,
+		      int	     pass)
+{
+	int		    error = 0;
+	xlog_recover_item_t *item, *first_item;
+
+	if ((error = xlog_recover_reorder_trans(log, trans)))
+		return error;
+	first_item = item = trans->r_itemq;
+	do {
+		/*
+		 * we don't need to worry about the block number being
+		 * truncated in > 1 TB buffers because in user-land,
+		 * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so
+		 * the blkno's will get through the user-mode buffer
+		 * cache properly.  The only bad case is o32 kernels
+		 * where xfs_daddr_t is 32-bits but mount will warn us
+		 * off a > 1 TB filesystem before we get here.
+		 */
+		if ((ITEM_TYPE(item) == XFS_LI_BUF) ||
+		    (ITEM_TYPE(item) == XFS_LI_6_1_BUF) ||
+		    (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) {
+			if  ((error = xlog_recover_do_buffer_trans(log, item,
+								 pass)))
+				break;
+		} else if ((ITEM_TYPE(item) == XFS_LI_INODE) ||
+			   (ITEM_TYPE(item) == XFS_LI_6_1_INODE) ||
+			   (ITEM_TYPE(item) == XFS_LI_5_3_INODE)) {
+			if ((error = xlog_recover_do_inode_trans(log, item,
+								pass)))
+				break;
+		} else if (ITEM_TYPE(item) == XFS_LI_EFI) {
+			xlog_recover_do_efi_trans(log, item, trans->r_lsn,
+						  pass);
+		} else if (ITEM_TYPE(item) == XFS_LI_EFD) {
+			xlog_recover_do_efd_trans(log, item, pass);
+		} else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
+			if ((error = xlog_recover_do_dquot_trans(log, item,
+								   pass)))
+					break;
+		} else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
+			if ((error = xlog_recover_do_quotaoff_trans(log, item,
+								   pass)))
+					break;
+		} else {
+			xlog_warn("XFS: xlog_recover_do_trans");
+			ASSERT(0);
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		item = item->ri_next;
+	} while (first_item != item);
+
+	return error;
+}	/* xlog_recover_do_trans */
+
+
+/*
+ * Free up any resources allocated by the transaction
+ *
+ * Remember that EFIs, EFDs, and IUNLINKs are handled later.
+ */
+STATIC void
+xlog_recover_free_trans(xlog_recover_t	    *trans)
+{
+	xlog_recover_item_t *first_item, *item, *free_item;
+	int i;
+
+	item = first_item = trans->r_itemq;
+	do {
+		free_item = item;
+		item = item->ri_next;
+		 /* Free the regions in the item. */
+		for (i = 0; i < free_item->ri_cnt; i++) {
+			kmem_free(free_item->ri_buf[i].i_addr,
+				  free_item->ri_buf[i].i_len);
+		}
+		/* Free the item itself */
+		kmem_free(free_item->ri_buf,
+			  (free_item->ri_total * sizeof(xfs_log_iovec_t)));
+		kmem_free(free_item, sizeof(xlog_recover_item_t));
+	} while (first_item != item);
+	/* Free the transaction recover structure */
+	kmem_free(trans, sizeof(xlog_recover_t));
+}	/* xlog_recover_free_trans */
+
+
+STATIC int
+xlog_recover_commit_trans(xlog_t	 *log,
+			  xlog_recover_t **q,
+			  xlog_recover_t *trans,
+			  int		 pass)
+{
+	int error;
+
+	if ((error = xlog_recover_unlink_tid(q, trans)))
+		return error;
+	if ((error = xlog_recover_do_trans(log, trans, pass)))
+		return error;
+	xlog_recover_free_trans(trans);			/* no error */
+	return 0;
+}	/* xlog_recover_commit_trans */
+
+
+/*ARGSUSED*/
+STATIC int
+xlog_recover_unmount_trans(xlog_recover_t *trans)
+{
+	/* Do nothing now */
+	xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
+	return( 0 );
+}	/* xlog_recover_unmount_trans */
+
+
+/*
+ * There are two valid states of the r_state field.  0 indicates that the
+ * transaction structure is in a normal state.	We have either seen the
+ * start of the transaction or the last operation we added was not a partial
+ * operation.  If the last operation we added to the transaction was a
+ * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
+ *
+ * NOTE: skip LRs with 0 data length.
+ */
+STATIC int
+xlog_recover_process_data(xlog_t	    *log,
+			  xlog_recover_t    *rhash[],
+			  xlog_rec_header_t *rhead,
+			  xfs_caddr_t	    dp,
+			  int		    pass)
+{
+    xfs_caddr_t		lp	   = dp+INT_GET(rhead->h_len, ARCH_CONVERT);
+    int			num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT);
+    xlog_op_header_t	*ohead;
+    xlog_recover_t	*trans;
+    xlog_tid_t		tid;
+    int			error;
+    unsigned long	hash;
+    uint		flags;
+
+    /* check the log format matches our own - else we can't recover */
+    if (xlog_header_check_recover(log->l_mp, rhead))
+	    return (XFS_ERROR(EIO));
+
+    while ((dp < lp) && num_logops) {
+	ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
+	ohead = (xlog_op_header_t *)dp;
+	dp += sizeof(xlog_op_header_t);
+	if (ohead->oh_clientid != XFS_TRANSACTION &&
+	    ohead->oh_clientid != XFS_LOG) {
+	    xlog_warn("XFS: xlog_recover_process_data: bad clientid");
+	    ASSERT(0);
+	    return (XFS_ERROR(EIO));
+	}
+	tid = INT_GET(ohead->oh_tid, ARCH_CONVERT);
+	hash = XLOG_RHASH(tid);
+	trans = xlog_recover_find_tid(rhash[hash], tid);
+	if (trans == NULL) {			   /* not found; add new tid */
+	    if (ohead->oh_flags & XLOG_START_TRANS)
+		xlog_recover_new_tid(&rhash[hash], tid, INT_GET(rhead->h_lsn, ARCH_CONVERT));
+	} else {
+	    ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp);
+	    flags = ohead->oh_flags & ~XLOG_END_TRANS;
+	    if (flags & XLOG_WAS_CONT_TRANS)
+		flags &= ~XLOG_CONTINUE_TRANS;
+	    switch (flags) {
+		case XLOG_COMMIT_TRANS: {
+		    error = xlog_recover_commit_trans(log, &rhash[hash],
+						      trans, pass);
+		    break;
+		}
+		case XLOG_UNMOUNT_TRANS: {
+		    error = xlog_recover_unmount_trans(trans);
+		    break;
+		}
+		case XLOG_WAS_CONT_TRANS: {
+		    error = xlog_recover_add_to_cont_trans(trans, dp,
+				  INT_GET(ohead->oh_len, ARCH_CONVERT));
+		    break;
+		}
+		case XLOG_START_TRANS : {
+		    xlog_warn("XFS: xlog_recover_process_data: bad transaction");
+		    ASSERT(0);
+		    error = XFS_ERROR(EIO);
+		    break;
+		}
+		case 0:
+		case XLOG_CONTINUE_TRANS: {
+		    error = xlog_recover_add_to_trans(trans, dp,
+				   INT_GET(ohead->oh_len, ARCH_CONVERT));
+		    break;
+		}
+		default: {
+		    xlog_warn("XFS: xlog_recover_process_data: bad flag");
+		    ASSERT(0);
+		    error = XFS_ERROR(EIO);
+		    break;
+		}
+	    } /* switch */
+	    if (error)
+		return error;
+	} /* if */
+	dp += INT_GET(ohead->oh_len, ARCH_CONVERT);
+	num_logops--;
+    }
+    return( 0 );
+}	/* xlog_recover_process_data */
+
+
+/*
+ * Process an extent free intent item that was recovered from
+ * the log.  We need to free the extents that it describes.
+ */
+STATIC void
+xlog_recover_process_efi(xfs_mount_t		*mp,
+			 xfs_efi_log_item_t	*efip)
+{
+	xfs_efd_log_item_t	*efdp;
+	xfs_trans_t		*tp;
+	int			i;
+	xfs_extent_t		*extp;
+	xfs_fsblock_t		startblock_fsb;
+
+	ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
+
+	/*
+	 * First check the validity of the extents described by the
+	 * EFI.	 If any are bad, then assume that all are bad and
+	 * just toss the EFI.
+	 */
+	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+		extp = &(efip->efi_format.efi_extents[i]);
+		startblock_fsb = XFS_BB_TO_FSB(mp,
+				   XFS_FSB_TO_DADDR(mp, extp->ext_start));
+		if ((startblock_fsb == 0) ||
+		    (extp->ext_len == 0) ||
+		    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
+		    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
+			/*
+			 * This will pull the EFI from the AIL and
+			 * free the memory associated with it.
+			 */
+			xfs_efi_release(efip, efip->efi_format.efi_nextents);
+			return;
+		}
+	}
+
+	tp = xfs_trans_alloc(mp, 0);
+	xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
+
+	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
+		extp = &(efip->efi_format.efi_extents[i]);
+		xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
+					 extp->ext_len);
+	}
+
+	efip->efi_flags |= XFS_EFI_RECOVERED;
+	xfs_trans_commit(tp, 0, NULL);
+}	/* xlog_recover_process_efi */
+
+
+/*
+ * Verify that once we've encountered something other than an EFI
+ * in the AIL that there are no more EFIs in the AIL.
+ */
+#if defined(DEBUG)
+STATIC void
+xlog_recover_check_ail(xfs_mount_t	*mp,
+		       xfs_log_item_t	*lip,
+		       int		gen)
+{
+	int	orig_gen;
+
+	orig_gen = gen;
+	do {
+		ASSERT(lip->li_type != XFS_LI_EFI);
+		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+		/*
+		 * The check will be bogus if we restart from the
+		 * beginning of the AIL, so ASSERT that we don't.
+		 * We never should since we're holding the AIL lock
+		 * the entire time.
+		 */
+		ASSERT(gen == orig_gen);
+	} while (lip != NULL);
+}
+#endif	/* DEBUG */
+
+
+/*
+ * When this is called, all of the EFIs which did not have
+ * corresponding EFDs should be in the AIL.  What we do now
+ * is free the extents associated with each one.
+ *
+ * Since we process the EFIs in normal transactions, they
+ * will be removed at some point after the commit.  This prevents
+ * us from just walking down the list processing each one.
+ * We'll use a flag in the EFI to skip those that we've already
+ * processed and use the AIL iteration mechanism's generation
+ * count to try to speed this up at least a bit.
+ *
+ * When we start, we know that the EFIs are the only things in
+ * the AIL.  As we process them, however, other items are added
+ * to the AIL.	Since everything added to the AIL must come after
+ * everything already in the AIL, we stop processing as soon as
+ * we see something other than an EFI in the AIL.
+ */
+STATIC void
+xlog_recover_process_efis(xlog_t	*log)
+{
+	xfs_log_item_t		*lip;
+	xfs_efi_log_item_t	*efip;
+	int			gen;
+	xfs_mount_t		*mp;
+	SPLDECL(s);
+
+	mp = log->l_mp;
+	AIL_LOCK(mp,s);
+
+	lip = xfs_trans_first_ail(mp, &gen);
+	while (lip != NULL) {
+		/*
+		 * We're done when we see something other than an EFI.
+		 */
+		if (lip->li_type != XFS_LI_EFI) {
+			xlog_recover_check_ail(mp, lip, gen);
+			break;
+		}
+
+		/*
+		 * Skip EFIs that we've already processed.
+		 */
+		efip = (xfs_efi_log_item_t *)lip;
+		if (efip->efi_flags & XFS_EFI_RECOVERED) {
+			lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+			continue;
+		}
+
+		AIL_UNLOCK(mp, s);
+		xlog_recover_process_efi(mp, efip);
+		AIL_LOCK(mp,s);
+		lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+	}
+	AIL_UNLOCK(mp, s);
+}	/* xlog_recover_process_efis */
+
+
+/*
+ * This routine performs a transaction to null out a bad inode pointer
+ * in an agi unlinked inode hash bucket.
+ */
+STATIC void
+xlog_recover_clear_agi_bucket(
+	xfs_mount_t	*mp,
+	xfs_agnumber_t	agno,
+	int		bucket)
+{
+	xfs_trans_t	*tp;
+	xfs_agi_t	*agi;
+	xfs_daddr_t	agidaddr;
+	xfs_buf_t	*agibp;
+	int		offset;
+	int		error;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
+	xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
+
+	agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agidaddr,
+				   1, 0, &agibp);
+	if (error) {
+		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+		return;
+	}
+
+	agi = XFS_BUF_TO_AGI(agibp);
+	if (INT_GET(agi->agi_magicnum, ARCH_CONVERT) != XFS_AGI_MAGIC) {
+		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+		return;
+	}
+	ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+
+	INT_SET(agi->agi_unlinked[bucket], ARCH_CONVERT, NULLAGINO);
+	offset = offsetof(xfs_agi_t, agi_unlinked) +
+		 (sizeof(xfs_agino_t) * bucket);
+	xfs_trans_log_buf(tp, agibp, offset,
+			  (offset + sizeof(xfs_agino_t) - 1));
+
+	(void) xfs_trans_commit(tp, 0, NULL);
+}	/* xlog_recover_clear_agi_bucket */
+
+
+/*
+ * xlog_iunlink_recover
+ *
+ * This is called during recovery to process any inodes which
+ * we unlinked but not freed when the system crashed.  These
+ * inodes will be on the lists in the AGI blocks.  What we do
+ * here is scan all the AGIs and fully truncate and free any
+ * inodes found on the lists.  Each inode is removed from the
+ * lists when it has been fully truncated and is freed.	 The
+ * freeing of the inode and its removal from the list must be
+ * atomic.
+ */
+void
+xlog_recover_process_iunlinks(xlog_t	*log)
+{
+	xfs_mount_t	*mp;
+	xfs_agnumber_t	agno;
+	xfs_agi_t	*agi;
+	xfs_daddr_t	agidaddr;
+	xfs_buf_t	*agibp;
+	xfs_buf_t	*ibp;
+	xfs_dinode_t	*dip;
+	xfs_inode_t	*ip;
+	xfs_agino_t	agino;
+	xfs_ino_t	ino;
+	int		bucket;
+	int		error;
+	uint		mp_dmevmask;
+
+	mp = log->l_mp;
+
+	/*
+	 * Prevent any DMAPI event from being sent while in this function.
+	 * Not a problem for xfs since the file system isn't mounted
+	 * yet.	 It is a problem for cxfs recovery.
+	 */
+	mp_dmevmask = mp->m_dmevmask;
+	mp->m_dmevmask = 0;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		/*
+		 * Find the agi for this ag.
+		 */
+		agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+		agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr, 1, 0);
+		if (XFS_BUF_ISERROR(agibp)) {
+			xfs_ioerror_alert("xlog_recover_process_iunlinks(agi#1)",
+					  log->l_mp, agibp, agidaddr);
+		}
+		agi = XFS_BUF_TO_AGI(agibp);
+		ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+
+		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
+
+			agino = INT_GET(agi->agi_unlinked[bucket], ARCH_CONVERT);
+			while (agino != NULLAGINO) {
+
+				/*
+				 * Release the agi buffer so that it can
+				 * be acquired in the normal course of the
+				 * transaction to truncate and free the inode.
+				 */
+				xfs_buf_relse(agibp);
+
+				ino = XFS_AGINO_TO_INO(mp, agno, agino);
+				error = xfs_iget(mp, NULL, ino, 0, &ip, 0);
+				ASSERT(error || (ip != NULL));
+
+				if (!error) {
+					/*
+					 * Get the on disk inode to find the
+					 * next inode in the bucket.
+					 */
+					error = xfs_itobp(mp, NULL, ip, &dip,
+							&ibp, 0);
+					ASSERT(error || (dip != NULL));
+				}
+
+				if (!error) {
+					ASSERT(ip->i_d.di_nlink == 0);
+					ASSERT(ip->i_d.di_mode != 0);
+
+					/* setup for the next pass */
+					agino = INT_GET(dip->di_next_unlinked,
+							ARCH_CONVERT);
+					xfs_buf_relse(ibp);
+					/*
+					 * Prevent any DMAPI event from
+					 * being sent when the
+					 * reference on the inode is
+					 * dropped.  Not a problem for
+					 * xfs since the file system
+					 * isn't mounted yet.  It is a
+					 * problem for cxfs recovery.
+					 */
+					ip->i_d.di_dmevmask = 0;
+
+					/*
+					 * Drop our reference to the
+					 * inode.  If there are no
+					 * other references, this will
+					 * send the inode to
+					 * xfs_inactive() which will
+					 * truncate the file and free
+					 * the inode.
+					 */
+					VN_RELE(XFS_ITOV(ip));
+				} else {
+					/*
+					 * We can't read in the inode
+					 * this bucket points to, or
+					 * this inode is messed up.  Just
+					 * ditch this bucket of inodes.	 We
+					 * will lose some inodes and space,
+					 * but at least we won't hang.	Call
+					 * xlog_recover_clear_agi_bucket()
+					 * to perform a transaction to clear
+					 * the inode pointer in the bucket.
+					 */
+					xlog_recover_clear_agi_bucket(mp, agno,
+							bucket);
+
+					agino = NULLAGINO;
+				}
+
+				/*
+				 * Reacquire the agibuffer and continue around
+				 * the loop.
+				 */
+				agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+				agibp = xfs_buf_read(mp->m_ddev_targp,
+						 agidaddr, 1, 0);
+				if (XFS_BUF_ISERROR(agibp)) {
+					xfs_ioerror_alert("xlog_recover_process_iunlinks(agi#2)",
+							  log->l_mp, agibp, agidaddr);
+				}
+				agi = XFS_BUF_TO_AGI(agibp);
+				ASSERT(INT_GET(agi->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+			}
+		}
+
+		/*
+		 * Release the buffer for the current agi so we can
+		 * go on to the next one.
+		 */
+		xfs_buf_relse(agibp);
+	}
+
+	mp->m_dmevmask = mp_dmevmask;
+
+}	/* xlog_recover_process_iunlinks */
+
+
+/*
+ * Stamp cycle number in every block
+ *
+ * This routine is also called in xfs_log.c
+ */
+/*ARGSUSED*/
+void
+xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog)
+{
+	int	i, j, k;
+	int	size = iclog->ic_offset + iclog->ic_roundoff;
+	xfs_caddr_t dp;
+	union ich {
+		xlog_rec_ext_header_t	hic_xheader;
+		char			hic_sector[XLOG_HEADER_SIZE];
+	} *xhdr;
+	uint	cycle_lsn;
+
+#ifdef DEBUG
+	uint	*up;
+	uint	chksum = 0;
+
+	up = (uint *)iclog->ic_datap;
+	/* divide length by 4 to get # words */
+	for (i=0; i<size >> 2; i++) {
+		chksum ^= INT_GET(*up, ARCH_CONVERT);
+		up++;
+	}
+	INT_SET(iclog->ic_header.h_chksum, ARCH_CONVERT, chksum);
+#endif /* DEBUG */
+
+	cycle_lsn = CYCLE_LSN_NOCONV(iclog->ic_header.h_lsn, ARCH_CONVERT);
+
+	dp = iclog->ic_datap;
+	for (i = 0; i < BTOBB(size) &&
+		i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+		iclog->ic_header.h_cycle_data[i] = *(uint *)dp;
+		*(uint *)dp = cycle_lsn;
+		dp += BBSIZE;
+	}
+
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		xhdr = (union ich*)&iclog->ic_header;
+		for ( ; i < BTOBB(size); i++) {
+			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			xhdr[j].hic_xheader.xh_cycle_data[k] = *(uint *)dp;
+			*(uint *)dp = cycle_lsn;
+			dp += BBSIZE;
+		}
+
+		for (i = 1; i < log->l_iclog_heads; i++) {
+			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+		}
+	}
+
+}	/* xlog_pack_data */
+
+
+/*ARGSUSED*/
+STATIC void
+xlog_unpack_data(xlog_rec_header_t *rhead,
+		 xfs_caddr_t	   dp,
+		 xlog_t		   *log)
+{
+	int i, j, k;
+	union ich {
+		xlog_rec_header_t	hic_header;
+		xlog_rec_ext_header_t	hic_xheader;
+		char			hic_sector[XLOG_HEADER_SIZE];
+	} *xhdr;
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	uint *up = (uint *)dp;
+	uint chksum = 0;
+#endif
+
+	for (i=0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) &&
+		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+		*(uint *)dp = *(uint *)&rhead->h_cycle_data[i];
+		dp += BBSIZE;
+	}
+
+	if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+		xhdr = (union ich*)rhead;
+		for ( ; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) {
+			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			*(uint *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
+			dp += BBSIZE;
+		}
+	}
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	/* divide length by 4 to get # words */
+	for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) {
+		chksum ^= INT_GET(*up, ARCH_CONVERT);
+		up++;
+	}
+	if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) {
+	    if (!INT_ISZERO(rhead->h_chksum, ARCH_CONVERT) ||
+		((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
+		    cmn_err(CE_DEBUG,
+			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
+			    INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
+		    cmn_err(CE_DEBUG,
+"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
+		    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+			    cmn_err(CE_DEBUG,
+				"XFS: LogR this is a LogV2 filesystem\n");
+		    }
+		    log->l_flags |= XLOG_CHKSUM_MISMATCH;
+	    }
+	}
+#endif /* DEBUG && XFS_LOUD_RECOVERY */
+}	/* xlog_unpack_data */
+
+
+/*
+ * Read the log from tail to head and process the log records found.
+ * Handle the two cases where the tail and head are in the same cycle
+ * and where the active portion of the log wraps around the end of
+ * the physical log separately.	 The pass parameter is passed through
+ * to the routines called to process the data and is not looked at
+ * here.
+ */
+STATIC int
+xlog_do_recovery_pass(xlog_t	*log,
+		      xfs_daddr_t	head_blk,
+		      xfs_daddr_t	tail_blk,
+		      int	pass)
+{
+    xlog_rec_header_t	*rhead;
+    xfs_daddr_t		blk_no;
+    xfs_caddr_t		bufaddr;
+    xfs_buf_t		*hbp, *dbp;
+    int			error, h_size;
+    int			bblks, split_bblks;
+    int			hblks, split_hblks, wrapped_hblks;
+    xlog_recover_t	*rhash[XLOG_RHASH_SIZE];
+
+    error = 0;
+
+
+    /*
+     * Read the header of the tail block and get the iclog buffer size from
+     * h_size.	Use this to tell how many sectors make up the log header.
+     */
+    if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) {
+	/*
+	 * When using variable length iclogs, read first sector of iclog
+	 * header and extract the header size from it.	Get a new hbp that
+	 * is the correct size.
+	 */
+	hbp = xlog_get_bp(1, log->l_mp);
+	if (!hbp)
+	    return ENOMEM;
+	if ((error = xlog_bread(log, tail_blk, 1, hbp)))
+	    goto bread_err1;
+	rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
+	ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) ==
+						XLOG_HEADER_MAGIC_NUM);
+	if ((INT_GET(rhead->h_version, ARCH_CONVERT) & (~XLOG_VERSION_OKBITS)) != 0) {
+	    xlog_warn("XFS: xlog_do_recovery_pass: unrecognised log version number.");
+	    error = XFS_ERROR(EIO);
+	    goto bread_err1;
+	}
+	h_size = INT_GET(rhead->h_size, ARCH_CONVERT);
+
+	if ((INT_GET(rhead->h_version, ARCH_CONVERT) & XLOG_VERSION_2) &&
+	    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
+	    hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
+	    if (h_size % XLOG_HEADER_CYCLE_SIZE)
+		hblks++;
+	    xlog_put_bp(hbp);
+	    hbp = xlog_get_bp(hblks, log->l_mp);
+	} else {
+	    hblks=1;
+	}
+    } else {
+	hblks=1;
+	hbp = xlog_get_bp(1, log->l_mp);
+	h_size = XLOG_BIG_RECORD_BSIZE;
+    }
+
+    if (!hbp)
+	return ENOMEM;
+    dbp = xlog_get_bp(BTOBB(h_size),log->l_mp);
+    if (!dbp) {
+	xlog_put_bp(hbp);
+	return ENOMEM;
+    }
+
+    bzero(rhash, sizeof(rhash));
+    if (tail_blk <= head_blk) {
+	for (blk_no = tail_blk; blk_no < head_blk; ) {
+	    if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+		goto bread_err2;
+	    rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
+	    ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+	    ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT) <= INT_MAX));
+	    bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));	/* blocks in data section */
+
+	    if ((INT_GET(rhead->h_magicno, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM) ||
+		(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT) > INT_MAX)) ||
+		(bblks <= 0) ||
+		(blk_no > log->l_logBBsize)) {
+		    error = EFSCORRUPTED;
+		    goto bread_err2;
+	    }
+
+	    if ((INT_GET(rhead->h_version, ARCH_CONVERT) & (~XLOG_VERSION_OKBITS)) != 0) {
+		xlog_warn("XFS: xlog_do_recovery_pass: unrecognised log version number.");
+		error = XFS_ERROR(EIO);
+		goto bread_err2;
+	    }
+	    bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));	/* blocks in data section */
+	    if (bblks > 0) {
+		if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
+		    goto bread_err2;
+		xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
+		if ((error = xlog_recover_process_data(log, rhash,
+						      rhead, XFS_BUF_PTR(dbp),
+						      pass)))
+			goto bread_err2;
+	    }
+	    blk_no += (bblks+hblks);
+	}
+    } else {
+	/*
+	 * Perform recovery around the end of the physical log.	 When the head
+	 * is not on the same cycle number as the tail, we can't do a sequential
+	 * recovery as above.
+	 */
+	blk_no = tail_blk;
+	while (blk_no < log->l_logBBsize) {
+	    /*
+	     * Check for header wrapping around physical end-of-log
+	     */
+	    wrapped_hblks = 0;
+	    if (blk_no+hblks <= log->l_logBBsize) {
+		/* Read header in one read */
+		if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+		    goto bread_err2;
+	    } else {
+		/* This log record is split across physical end of log */
+		split_hblks = 0;
+		if (blk_no != log->l_logBBsize) {
+		    /* some data is before physical end of log */
+		    ASSERT(blk_no <= INT_MAX);
+		    split_hblks = log->l_logBBsize - (int)blk_no;
+		    ASSERT(split_hblks > 0);
+		    if ((error = xlog_bread(log, blk_no, split_hblks, hbp)))
+			goto bread_err2;
+		}
+		bufaddr = XFS_BUF_PTR(hbp);
+		XFS_BUF_SET_PTR(hbp, bufaddr + BBTOB(split_hblks),
+			BBTOB(hblks - split_hblks));
+		wrapped_hblks = hblks - split_hblks;
+		if ((error = xlog_bread(log, 0, wrapped_hblks, hbp)))
+		    goto bread_err2;
+		XFS_BUF_SET_PTR(hbp, bufaddr, hblks);
+	    }
+	    rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
+	    ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+	    ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT) <= INT_MAX));
+	    bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
+
+	    /* LR body must have data or it wouldn't have been written */
+	    ASSERT(bblks > 0);
+	    blk_no += hblks;			/* successfully read header */
+
+	    if ((INT_GET(rhead->h_magicno, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM) ||
+		(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT) > INT_MAX)) ||
+		(bblks <= 0)) {
+		    error = EFSCORRUPTED;
+		    goto bread_err2;
+	    }
+
+	    /* Read in data for log record */
+	    if (blk_no+bblks <= log->l_logBBsize) {
+		if ((error = xlog_bread(log, blk_no, bblks, dbp)))
+		    goto bread_err2;
+	    } else {
+		/* This log record is split across physical end of log */
+		split_bblks = 0;
+		if (blk_no != log->l_logBBsize) {
+
+		    /* some data is before physical end of log */
+		    ASSERT(blk_no <= INT_MAX);
+		    split_bblks = log->l_logBBsize - (int)blk_no;
+		    ASSERT(split_bblks > 0);
+		    if ((error = xlog_bread(log, blk_no, split_bblks, dbp)))
+			goto bread_err2;
+		}
+		bufaddr = XFS_BUF_PTR(dbp);
+		XFS_BUF_SET_PTR(dbp, bufaddr + BBTOB(split_bblks),
+			BBTOB(bblks - split_bblks));
+		if ((error = xlog_bread(log, wrapped_hblks,
+					bblks - split_bblks, dbp)))
+		    goto bread_err2;
+		XFS_BUF_SET_PTR(dbp, bufaddr, XLOG_BIG_RECORD_BSIZE);
+	    }
+	    xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
+	    if ((error = xlog_recover_process_data(log, rhash,
+						  rhead, XFS_BUF_PTR(dbp),
+						  pass)))
+		goto bread_err2;
+	    blk_no += bblks;
+	}
+
+	ASSERT(blk_no >= log->l_logBBsize);
+	blk_no -= log->l_logBBsize;
+
+	/* read first part of physical log */
+	while (blk_no < head_blk) {
+	    if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+		goto bread_err2;
+	    rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
+	    ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
+	    ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT) <= INT_MAX));
+	    bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
+	    ASSERT(bblks > 0);
+	    if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
+		goto bread_err2;
+	    xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
+	    if ((error = xlog_recover_process_data(log, rhash,
+						  rhead, XFS_BUF_PTR(dbp),
+						  pass)))
+		goto bread_err2;
+	    blk_no += (bblks+hblks);
+	}
+    }
+
+bread_err2:
+    xlog_put_bp(dbp);
+bread_err1:
+    xlog_put_bp(hbp);
+
+    return error;
+}
+
+/*
+ * Do the recovery of the log.	We actually do this in two phases.
+ * The two passes are necessary in order to implement the function
+ * of cancelling a record written into the log.	 The first pass
+ * determines those things which have been cancelled, and the
+ * second pass replays log items normally except for those which
+ * have been cancelled.	 The handling of the replay and cancellations
+ * takes place in the log item type specific routines.
+ *
+ * The table of items which have cancel records in the log is allocated
+ * and freed at this level, since only here do we know when all of
+ * the log recovery has been completed.
+ */
+STATIC int
+xlog_do_log_recovery(xlog_t	*log,
+		     xfs_daddr_t	head_blk,
+		     xfs_daddr_t	tail_blk)
+{
+	int		error;
+#ifdef DEBUG
+	int		i;
+#endif
+
+	/*
+	 * First do a pass to find all of the cancelled buf log items.
+	 * Store them in the buf_cancel_table for use in the second pass.
+	 */
+	log->l_buf_cancel_table =
+		(xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
+						 sizeof(xfs_buf_cancel_t*),
+						 KM_SLEEP);
+	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+				      XLOG_RECOVER_PASS1);
+	if (error != 0) {
+		kmem_free(log->l_buf_cancel_table,
+			  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
+		log->l_buf_cancel_table = NULL;
+		return error;
+	}
+	/*
+	 * Then do a second pass to actually recover the items in the log.
+	 * When it is complete free the table of buf cancel items.
+	 */
+	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
+				      XLOG_RECOVER_PASS2);
+#ifdef	DEBUG
+	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) {
+		ASSERT(log->l_buf_cancel_table[i] == NULL);
+	}
+#endif	/* DEBUG */
+	kmem_free(log->l_buf_cancel_table,
+		  XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
+	log->l_buf_cancel_table = NULL;
+
+	return error;
+}
+
+/*
+ * Do the actual recovery
+ */
+STATIC int
+xlog_do_recover(xlog_t	*log,
+		xfs_daddr_t head_blk,
+		xfs_daddr_t tail_blk)
+{
+	int		error;
+	xfs_buf_t	*bp;
+	xfs_sb_t	*sbp;
+
+	/*
+	 * First replay the images in the log.
+	 */
+	error = xlog_do_log_recovery(log, head_blk, tail_blk);
+	if (error) {
+		return error;
+	}
+
+	XFS_bflush(log->l_mp->m_ddev_targp);
+
+	/*
+	 * If IO errors happened during recovery, bail out.
+	 */
+	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		return (EIO);
+	}
+
+	/*
+	 * We now update the tail_lsn since much of the recovery has completed
+	 * and there may be space available to use.  If there were no extent
+	 * or iunlinks, we can free up the entire log and set the tail_lsn to
+	 * be the last_sync_lsn.  This was set in xlog_find_tail to be the
+	 * lsn of the last known good LR on disk.  If there are extent frees
+	 * or iunlinks they will have some entries in the AIL; so we look at
+	 * the AIL to determine how to set the tail_lsn.
+	 */
+	xlog_assign_tail_lsn(log->l_mp, NULL);
+
+	/*
+	 * Now that we've finished replaying all buffer and inode
+	 * updates, re-read in the superblock.
+	 */
+	bp = xfs_getsb(log->l_mp, 0);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_READ(bp);
+	xfsbdstrat(log->l_mp, bp);
+	if ((error = xfs_iowait(bp))) {
+		xfs_ioerror_alert("xlog_do_recover",
+				  log->l_mp, bp, XFS_BUF_ADDR(bp));
+		ASSERT(0);
+		xfs_buf_relse(bp);
+		return error;
+	}
+
+	/* convert superblock from on-disk format */
+
+	sbp=&log->l_mp->m_sb;
+	xfs_xlatesb(XFS_BUF_TO_SBP(bp), sbp, 1, ARCH_CONVERT, XFS_SB_ALL_BITS);
+	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
+	ASSERT(XFS_SB_GOOD_VERSION(sbp));
+	xfs_buf_relse(bp);
+
+	xlog_recover_check_summary(log);
+
+	/* Normal transactions can now occur */
+	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+	return 0;
+}	/* xlog_do_recover */
+
+/*
+ * Perform recovery and re-initialize some log variables in xlog_find_tail.
+ *
+ * Return error or zero.
+ */
+int
+xlog_recover(xlog_t *log, int readonly)
+{
+	xfs_daddr_t head_blk, tail_blk;
+	int	error;
+
+	/* find the tail of the log */
+
+	if ((error = xlog_find_tail(log, &head_blk, &tail_blk, readonly)))
+		return error;
+
+	if (tail_blk != head_blk) {
+#ifndef __KERNEL__
+		extern xfs_daddr_t HEAD_BLK, TAIL_BLK;
+		head_blk = HEAD_BLK;
+		tail_blk = TAIL_BLK;
+#endif
+		/* There used to be a comment here:
+		 *
+		 * disallow recovery on read-only mounts.  note -- mount
+		 * checks for ENOSPC and turns it into an intelligent
+		 * error message.
+		 * ...but this is no longer true.  Now, unless you specify
+		 * NORECOVERY (in which case this function would never be
+		 * called), it enables read-write access long enough to do
+		 * recovery.
+		 */
+		if (readonly) {
+#ifdef __KERNEL__
+			if ((error = xfs_recover_read_only(log)))
+				return error;
+#else
+			return ENOSPC;
+#endif
+		}
+
+#ifdef __KERNEL__
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+		cmn_err(CE_NOTE,
+			"Starting XFS recovery on filesystem: %s (dev: %d/%d)",
+			log->l_mp->m_fsname, MAJOR(log->l_dev),
+			MINOR(log->l_dev));
+#else
+		cmn_err(CE_NOTE,
+			"!Starting XFS recovery on filesystem: %s (dev: %d/%d)",
+			log->l_mp->m_fsname, MAJOR(log->l_dev),
+			MINOR(log->l_dev));
+#endif
+#endif
+		error = xlog_do_recover(log, head_blk, tail_blk);
+		log->l_flags |= XLOG_RECOVERY_NEEDED;
+		if (readonly)
+			XFS_MTOVFS(log->l_mp)->vfs_flag |= VFS_RDONLY;
+	}
+	return error;
+}	/* xlog_recover */
+
+
+/*
+ * In the first part of recovery we replay inodes and buffers and build
+ * up the list of extent free items which need to be processed.	 Here
+ * we process the extent free items and clean up the on disk unlinked
+ * inode lists.	 This is separated from the first part of recovery so
+ * that the root and real-time bitmap inodes can be read in from disk in
+ * between the two stages.  This is necessary so that we can free space
+ * in the real-time portion of the file system.
+ */
+int
+xlog_recover_finish(xlog_t *log, int mfsi_flags)
+{
+	/*
+	 * Now we're ready to do the transactions needed for the
+	 * rest of recovery.  Start with completing all the extent
+	 * free intent records and then process the unlinked inode
+	 * lists.  At this point, we essentially run in normal mode
+	 * except that we're still performing recovery actions
+	 * rather than accepting new requests.
+	 */
+	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
+		xlog_recover_process_efis(log);
+		/*
+		 * Sync the log to get all the EFIs out of the AIL.
+		 * This isn't absolutely necessary, but it helps in
+		 * case the unlink transactions would have problems
+		 * pushing the EFIs out of the way.
+		 */
+		xfs_log_force(log->l_mp, (xfs_lsn_t)0,
+			      (XFS_LOG_FORCE | XFS_LOG_SYNC));
+
+		if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) {
+
+			xlog_recover_process_iunlinks(log);
+		}
+
+		xlog_recover_check_summary(log);
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+		cmn_err(CE_NOTE,
+			"Ending XFS recovery on filesystem: %s (dev: %d/%d)",
+			log->l_mp->m_fsname, MAJOR(log->l_dev),
+			MINOR(log->l_dev));
+#else
+		cmn_err(CE_NOTE,
+			"!Ending XFS recovery on filesystem: %s (dev: %d/%d)",
+			log->l_mp->m_fsname, MAJOR(log->l_dev),
+			MINOR(log->l_dev));
+#endif
+		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
+	} else {
+		cmn_err(CE_DEBUG,
+			"!Ending clean XFS mount for filesystem: %s\n",
+			log->l_mp->m_fsname);
+	}
+	return 0;
+}	/* xlog_recover_finish */
+
+
+#if defined(DEBUG)
+/*
+ * Read all of the agf and agi counters and check that they
+ * are consistent with the superblock counters.
+ */
+void
+xlog_recover_check_summary(xlog_t	*log)
+{
+	xfs_mount_t	*mp;
+	xfs_agf_t	*agfp;
+	xfs_agi_t	*agip;
+	xfs_buf_t	*agfbp;
+	xfs_buf_t	*agibp;
+	xfs_daddr_t	agfdaddr;
+	xfs_daddr_t	agidaddr;
+	xfs_buf_t	*sbbp;
+#ifdef XFS_LOUD_RECOVERY
+	xfs_sb_t	*sbp;
+#endif
+	xfs_agnumber_t	agno;
+	__uint64_t	freeblks;
+	__uint64_t	itotal;
+	__uint64_t	ifree;
+
+	mp = log->l_mp;
+
+	freeblks = 0LL;
+	itotal = 0LL;
+	ifree = 0LL;
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR);
+		agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr, 1, 0);
+		if (XFS_BUF_ISERROR(agfbp)) {
+			xfs_ioerror_alert("xlog_recover_check_summary(agf)",
+					  log->l_mp, agfbp, agfdaddr);
+		}
+		agfp = XFS_BUF_TO_AGF(agfbp);
+		ASSERT(INT_GET(agfp->agf_magicnum, ARCH_CONVERT) == XFS_AGF_MAGIC);
+		ASSERT(XFS_AGF_GOOD_VERSION(INT_GET(agfp->agf_versionnum, ARCH_CONVERT)));
+		ASSERT(INT_GET(agfp->agf_seqno, ARCH_CONVERT) == agno);
+
+		freeblks += INT_GET(agfp->agf_freeblks, ARCH_CONVERT) +
+			    INT_GET(agfp->agf_flcount, ARCH_CONVERT);
+		xfs_buf_relse(agfbp);
+
+		agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR);
+		agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr, 1, 0);
+		if (XFS_BUF_ISERROR(agibp)) {
+			xfs_ioerror_alert("xlog_recover_check_summary(agi)",
+					  log->l_mp, agibp, agidaddr);
+		}
+		agip = XFS_BUF_TO_AGI(agibp);
+		ASSERT(INT_GET(agip->agi_magicnum, ARCH_CONVERT) == XFS_AGI_MAGIC);
+		ASSERT(XFS_AGI_GOOD_VERSION(INT_GET(agip->agi_versionnum, ARCH_CONVERT)));
+		ASSERT(INT_GET(agip->agi_seqno, ARCH_CONVERT) == agno);
+
+		itotal += INT_GET(agip->agi_count, ARCH_CONVERT);
+		ifree += INT_GET(agip->agi_freecount, ARCH_CONVERT);
+		xfs_buf_relse(agibp);
+	}
+
+	sbbp = xfs_getsb(mp, 0);
+#ifdef XFS_LOUD_RECOVERY
+	sbp = XFS_BUF_TO_SBP(sbbp);
+	cmn_err(CE_NOTE,
+		"xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
+		sbp->sb_icount, itotal);
+	cmn_err(CE_NOTE,
+		"xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
+		sbp->sb_ifree, ifree);
+	cmn_err(CE_NOTE,
+		"xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
+		sbp->sb_fdblocks, freeblks);
+#if 0
+	/*
+	 * This is turned off until I account for the allocation
+	 * btree blocks which live in free space.
+	 */
+	ASSERT(sbp->sb_icount == itotal);
+	ASSERT(sbp->sb_ifree == ifree);
+	ASSERT(sbp->sb_fdblocks == freeblks);
+#endif
+#endif
+	xfs_buf_relse(sbbp);
+}
+#endif /* DEBUG */
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
new file mode 100644
index 000000000000..b3bf72c1c47c
--- /dev/null
+++ b/fs/xfs/xfs_log_recover.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_LOG_RECOVER_H__
+#define __XFS_LOG_RECOVER_H__
+
+/*
+ * Macros, structures, prototypes for internal log manager use.
+ */
+
+#define XLOG_RHASH_BITS	 4
+#define XLOG_RHASH_SIZE 16
+#define XLOG_RHASH_SHIFT 2
+#define XLOG_RHASH(tid) \
+	((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
+
+#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1)
+
+
+/*
+ * item headers are in ri_buf[0].  Additional buffers follow.
+ */
+typedef struct xlog_recover_item {
+	struct xlog_recover_item *ri_next;
+	struct xlog_recover_item *ri_prev;
+	int			 ri_type;
+	int			 ri_cnt;	/* count of regions found */
+	int			 ri_total;	/* total regions */
+	xfs_log_iovec_t		 *ri_buf;	/* ptr to regions buffer */
+} xlog_recover_item_t;
+
+struct xlog_tid;
+typedef struct xlog_recover {
+	struct xlog_recover *r_next;
+	xlog_tid_t	    r_log_tid;		/* log's transaction id */
+	xfs_trans_header_t  r_theader;		/* trans header for partial */
+	int		    r_state;		/* not needed */
+	xfs_lsn_t	    r_lsn;		/* xact lsn */
+	xlog_recover_item_t *r_itemq;		/* q for items */
+} xlog_recover_t;
+
+#define ITEM_TYPE(i)	(*(ushort *)(i)->ri_buf[0].i_addr)
+
+/*
+ * This is the number of entries in the l_buf_cancel_table used during
+ * recovery.
+ */
+#define XLOG_BC_TABLE_SIZE	64
+
+#define XLOG_RECOVER_PASS1	1
+#define XLOG_RECOVER_PASS2	2
+
+#endif	/* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_mac.c b/fs/xfs/xfs_mac.c
new file mode 100644
index 000000000000..5b20891b7bb1
--- /dev/null
+++ b/fs/xfs/xfs_mac.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+static xfs_mac_label_t *mac_low_high_lp;
+static xfs_mac_label_t *mac_high_low_lp;
+static xfs_mac_label_t *mac_admin_high_lp;
+static xfs_mac_label_t *mac_equal_equal_lp;
+
+/*
+ * Test for the existence of a MAC label as efficiently as possible.
+ */
+int
+xfs_mac_vhaslabel(
+	vnode_t		*vp)
+{
+	int		error;
+	int		len = sizeof(xfs_mac_label_t);
+	int		flags = ATTR_KERNOVAL|ATTR_ROOT;
+
+	VOP_ATTR_GET(vp, SGI_MAC_FILE, NULL, &len, flags, sys_cred, error);
+	return (error == 0);
+}
+
+int
+xfs_mac_iaccess(xfs_inode_t *ip, mode_t mode, struct cred *cr)
+{
+	xfs_mac_label_t mac;
+	xfs_mac_label_t *mp = mac_high_low_lp;
+
+	if (cr == NULL || sys_cred == NULL ) {
+		return EACCES;
+	}
+
+	if (xfs_attr_fetch(ip, SGI_MAC_FILE, (char *)&mac, sizeof(mac)) == 0) {
+		if ((mp = mac_add_label(&mac)) == NULL) {
+			return mac_access(mac_high_low_lp, cr, mode);
+		}
+	}
+
+	return mac_access(mp, cr, mode);
+}
diff --git a/fs/xfs/xfs_mac.h b/fs/xfs/xfs_mac.h
new file mode 100644
index 000000000000..8a6179d9e591
--- /dev/null
+++ b/fs/xfs/xfs_mac.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_MAC_H__
+#define __XFS_MAC_H__
+
+/*
+ * Mandatory Access Control
+ *
+ * Layout of a composite MAC label:
+ * ml_list contains the list of categories (MSEN) followed by the list of
+ * divisions (MINT). This is actually a header for the data structure which
+ * will have an ml_list with more than one element.
+ *
+ *	-------------------------------
+ *	| ml_msen_type | ml_mint_type |
+ *	-------------------------------
+ *	| ml_level     | ml_grade     |
+ *	-------------------------------
+ *	| ml_catcount		      |
+ *	-------------------------------
+ *	| ml_divcount		      |
+ *	-------------------------------
+ *	| category 1		      |
+ *	| . . .			      |
+ *	| category N		      | (where N = ml_catcount)
+ *	-------------------------------
+ *	| division 1		      |
+ *	| . . .			      |
+ *	| division M		      | (where M = ml_divcount)
+ *	-------------------------------
+ */
+#define XFS_MAC_MAX_SETS	250
+typedef struct xfs_mac_label {
+	__uint8_t	ml_msen_type;	/* MSEN label type */
+	__uint8_t	ml_mint_type;	/* MINT label type */
+	__uint8_t	ml_level;	/* Hierarchical level */
+	__uint8_t	ml_grade;	/* Hierarchical grade */
+	__uint16_t	ml_catcount;	/* Category count */
+	__uint16_t	ml_divcount;	/* Division count */
+					/* Category set, then Division set */
+	__uint16_t	ml_list[XFS_MAC_MAX_SETS];
+} xfs_mac_label_t;
+
+/* MSEN label type names. Choose an upper case ASCII character.	 */
+#define XFS_MSEN_ADMIN_LABEL	'A'	/* Admin: low<admin != tcsec<high */
+#define XFS_MSEN_EQUAL_LABEL	'E'	/* Wildcard - always equal */
+#define XFS_MSEN_HIGH_LABEL	'H'	/* System High - always dominates */
+#define XFS_MSEN_MLD_HIGH_LABEL 'I'	/* System High, multi-level dir */
+#define XFS_MSEN_LOW_LABEL	'L'	/* System Low - always dominated */
+#define XFS_MSEN_MLD_LABEL	'M'	/* TCSEC label on a multi-level dir */
+#define XFS_MSEN_MLD_LOW_LABEL	'N'	/* System Low, multi-level dir */
+#define XFS_MSEN_TCSEC_LABEL	'T'	/* TCSEC label */
+#define XFS_MSEN_UNKNOWN_LABEL	'U'	/* unknown label */
+
+/* MINT label type names. Choose a lower case ASCII character.	*/
+#define XFS_MINT_BIBA_LABEL	'b'	/* Dual of a TCSEC label */
+#define XFS_MINT_EQUAL_LABEL	'e'	/* Wildcard - always equal */
+#define XFS_MINT_HIGH_LABEL	'h'	/* High Grade - always dominates */
+#define XFS_MINT_LOW_LABEL	'l'	/* Low Grade - always dominated */
+
+/* On-disk XFS extended attribute names */
+#define SGI_MAC_FILE	"SGI_MAC_FILE"
+#define SGI_MAC_FILE_SIZE	(sizeof(SGI_MAC_FILE)-1)
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_FS_POSIX_MAC
+
+/* NOT YET IMPLEMENTED */
+
+#define MACEXEC		00100
+#define MACWRITE	00200
+#define MACREAD		00400
+
+struct xfs_inode;
+extern int  xfs_mac_iaccess(struct xfs_inode *, mode_t, cred_t *);
+
+#define _MAC_XFS_IACCESS(i,m,c) (xfs_mac_iaccess(i,m,c))
+#define _MAC_VACCESS(v,c,m)	(xfs_mac_vaccess(v,c,m))
+#define _MAC_EXISTS		xfs_mac_vhaslabel
+
+#else
+#define _MAC_XFS_IACCESS(i,m,c) (0)
+#define _MAC_VACCESS(v,c,m)	(0)
+#define _MAC_EXISTS		(NULL)
+#endif
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_MAC_H__ */
diff --git a/fs/xfs/xfs_macros.c b/fs/xfs/xfs_macros.c
new file mode 100644
index 000000000000..022d2ae5e192
--- /dev/null
+++ b/fs/xfs/xfs_macros.c
@@ -0,0 +1,2228 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#define XFS_MACRO_C
+
+#include <xfs.h>
+
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_ISNULLDSTARTBLOCK)
+int
+isnulldstartblock(xfs_dfsbno_t x)
+{
+	return ISNULLDSTARTBLOCK(x);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_ISNULLSTARTBLOCK)
+int
+isnullstartblock(xfs_fsblock_t x)
+{
+	return ISNULLSTARTBLOCK(x);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_NULLSTARTBLOCK)
+xfs_fsblock_t
+nullstartblock(int k)
+{
+	return NULLSTARTBLOCK(k);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_STARTBLOCKVAL)
+xfs_filblks_t
+startblockval(xfs_fsblock_t x)
+{
+	return STARTBLOCKVAL(x);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_CHECK_DADDR)
+void
+xfs_ag_check_daddr(xfs_mount_t *mp, xfs_daddr_t d, xfs_extlen_t len)
+{
+	XFS_AG_CHECK_DADDR(mp, d, len);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_DADDR)
+xfs_daddr_t
+xfs_ag_daddr(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_daddr_t d)
+{
+	return XFS_AG_DADDR(mp, agno, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_BEST_BLOCKS)
+xfs_extlen_t
+xfs_ag_best_blocks(int bl, xfs_drfsbno_t blks)
+{
+	return XFS_AG_BEST_BLOCKS(bl, blks);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_MAX_BLOCKS)
+xfs_extlen_t
+xfs_ag_max_blocks(int bl)
+{
+	return XFS_AG_MAX_BLOCKS(bl);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_MAXLEVELS)
+int
+xfs_ag_maxlevels(xfs_mount_t *mp)
+{
+	return XFS_AG_MAXLEVELS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AG_MIN_BLOCKS)
+xfs_extlen_t
+xfs_ag_min_blocks(int bl)
+{
+	return XFS_AG_MIN_BLOCKS(bl);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGB_TO_DADDR)
+xfs_daddr_t
+xfs_agb_to_daddr(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno)
+{
+	return XFS_AGB_TO_DADDR(mp, agno, agbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGB_TO_FSB)
+xfs_fsblock_t
+xfs_agb_to_fsb(xfs_mount_t *mp, xfs_agnumber_t agno, xfs_agblock_t agbno)
+{
+	return XFS_AGB_TO_FSB(mp, agno, agbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGBLOCK_MAX)
+xfs_agblock_t
+xfs_agblock_max(xfs_agblock_t a, xfs_agblock_t b)
+{
+	return XFS_AGBLOCK_MAX(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGBLOCK_MIN)
+xfs_agblock_t
+xfs_agblock_min(xfs_agblock_t a, xfs_agblock_t b)
+{
+	return XFS_AGBLOCK_MIN(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGF_BLOCK)
+xfs_agblock_t
+xfs_agf_block(xfs_mount_t *mp)
+{
+	return XFS_AGF_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGF_GOOD_VERSION)
+int
+xfs_agf_good_version(unsigned v)
+{
+	return XFS_AGF_GOOD_VERSION(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGFL_BLOCK)
+xfs_agblock_t
+xfs_agfl_block(xfs_mount_t *mp)
+{
+	return XFS_AGFL_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGI_BLOCK)
+xfs_agblock_t
+xfs_agi_block(xfs_mount_t *mp)
+{
+	return XFS_AGI_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGI_GOOD_VERSION)
+int
+xfs_agi_good_version(unsigned v)
+{
+	return XFS_AGI_GOOD_VERSION(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_AGBNO)
+xfs_agblock_t
+xfs_agino_to_agbno(xfs_mount_t *mp, xfs_agino_t i)
+{
+	return XFS_AGINO_TO_AGBNO(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_INO)
+xfs_ino_t
+xfs_agino_to_ino(xfs_mount_t *mp, xfs_agnumber_t a, xfs_agino_t i)
+{
+	return XFS_AGINO_TO_INO(mp, a, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_AGINO_TO_OFFSET)
+int
+xfs_agino_to_offset(xfs_mount_t *mp, xfs_agino_t i)
+{
+	return XFS_AGINO_TO_OFFSET(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_MAXRECS)
+int
+xfs_alloc_block_maxrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_BLOCK_MAXRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_MINRECS)
+int
+xfs_alloc_block_minrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_BLOCK_MINRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_BLOCK_SIZE)
+/*ARGSUSED1*/
+int
+xfs_alloc_block_size(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_BLOCK_SIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_KEY_ADDR)
+/*ARGSUSED3*/
+xfs_alloc_key_t *
+xfs_alloc_key_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_KEY_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_PTR_ADDR)
+xfs_alloc_ptr_t *
+xfs_alloc_ptr_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_PTR_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ALLOC_REC_ADDR)
+/*ARGSUSED3*/
+xfs_alloc_rec_t *
+xfs_alloc_rec_addr(xfs_alloc_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_ALLOC_REC_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL)
+int
+xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+	return XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen, vlen);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX)
+int
+xfs_attr_leaf_entsize_local_max(int bsize)
+{
+	return XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_ENTSIZE_REMOTE)
+int
+xfs_attr_leaf_entsize_remote(int nlen)
+{
+	return XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME)
+char *
+xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return XFS_ATTR_LEAF_NAME(leafp, idx);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME_LOCAL)
+xfs_attr_leaf_name_local_t *
+xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return XFS_ATTR_LEAF_NAME_LOCAL(leafp, idx);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_LEAF_NAME_REMOTE)
+xfs_attr_leaf_name_remote_t *
+xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+	return XFS_ATTR_LEAF_NAME_REMOTE(leafp, idx);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_ENTSIZE)
+int
+xfs_attr_sf_entsize(xfs_attr_sf_entry_t *sfep)
+{
+	return XFS_ATTR_SF_ENTSIZE(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_ENTSIZE_BYNAME)
+int
+xfs_attr_sf_entsize_byname(int nlen, int vlen)
+{
+	return XFS_ATTR_SF_ENTSIZE_BYNAME(nlen, vlen);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_NEXTENTRY)
+xfs_attr_sf_entry_t *
+xfs_attr_sf_nextentry(xfs_attr_sf_entry_t *sfep)
+{
+	return XFS_ATTR_SF_NEXTENTRY(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ATTR_SF_TOTSIZE)
+int
+xfs_attr_sf_totsize(xfs_inode_t *dp)
+{
+	return XFS_ATTR_SF_TOTSIZE(dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BHVTOI)
+xfs_inode_t *
+xfs_bhvtoi(bhv_desc_t *bhvp)
+{
+	return XFS_BHVTOI(bhvp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BHVTOM)
+xfs_mount_t *
+xfs_bhvtom(bhv_desc_t *bdp)
+{
+	return XFS_BHVTOM(bdp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BM_MAXLEVELS)
+int
+xfs_bm_maxlevels(xfs_mount_t *mp, int w)
+{
+	return XFS_BM_MAXLEVELS(mp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DMAXRECS)
+int
+xfs_bmap_block_dmaxrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_DMAXRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DMINRECS)
+int
+xfs_bmap_block_dminrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_DMINRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_DSIZE)
+int
+xfs_bmap_block_dsize(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_DSIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_IMAXRECS)
+int
+xfs_bmap_block_imaxrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_IMAXRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_IMINRECS)
+int
+xfs_bmap_block_iminrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_IMINRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BLOCK_ISIZE)
+int
+xfs_bmap_block_isize(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_BLOCK_ISIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_KEY_ADDR)
+/*ARGSUSED3*/
+xfs_bmbt_key_t *
+xfs_bmap_broot_key_addr(xfs_bmbt_block_t *bb, int i, int sz)
+{
+	return XFS_BMAP_BROOT_KEY_ADDR(bb, i, sz);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_MAXRECS)
+int
+xfs_bmap_broot_maxrecs(int sz)
+{
+	return XFS_BMAP_BROOT_MAXRECS(sz);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_NUMRECS)
+int
+xfs_bmap_broot_numrecs(xfs_bmdr_block_t *bb)
+{
+	return XFS_BMAP_BROOT_NUMRECS(bb);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_PTR_ADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_broot_ptr_addr(xfs_bmbt_block_t *bb, int i, int sz)
+{
+	return XFS_BMAP_BROOT_PTR_ADDR(bb, i, sz);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_REC_ADDR)
+/*ARGSUSED3*/
+xfs_bmbt_rec_t *
+xfs_bmap_broot_rec_addr(xfs_bmbt_block_t *bb, int i, int sz)
+{
+	return XFS_BMAP_BROOT_REC_ADDR(bb, i, sz);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_SPACE)
+int
+xfs_bmap_broot_space(xfs_bmdr_block_t *bb)
+{
+	return XFS_BMAP_BROOT_SPACE(bb);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_BROOT_SPACE_CALC)
+int
+xfs_bmap_broot_space_calc(int nrecs)
+{
+	return XFS_BMAP_BROOT_SPACE_CALC(nrecs);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_IBLOCK_SIZE)
+/*ARGSUSED1*/
+int
+xfs_bmap_iblock_size(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_IBLOCK_SIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_INIT)
+void
+xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
+{
+	XFS_BMAP_INIT(flp, fbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_KEY_DADDR)
+/*ARGSUSED3*/
+xfs_bmbt_key_t *
+xfs_bmap_key_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_KEY_DADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_KEY_IADDR)
+/*ARGSUSED3*/
+xfs_bmbt_key_t *
+xfs_bmap_key_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_KEY_IADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_PTR_DADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_PTR_DADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_PTR_IADDR)
+xfs_bmbt_ptr_t *
+xfs_bmap_ptr_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_PTR_IADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_RBLOCK_DSIZE)
+/*ARGSUSED1*/
+int
+xfs_bmap_rblock_dsize(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_RBLOCK_DSIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_RBLOCK_ISIZE)
+/*ARGSUSED1*/
+int
+xfs_bmap_rblock_isize(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_RBLOCK_ISIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_REC_DADDR)
+/*ARGSUSED3*/
+xfs_bmbt_rec_t *
+xfs_bmap_rec_daddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_REC_DADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_REC_IADDR)
+/*ARGSUSED3*/
+xfs_bmbt_rec_t *
+xfs_bmap_rec_iaddr(xfs_bmbt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_BMAP_REC_IADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAP_SANITY_CHECK)
+int
+xfs_bmap_sanity_check(xfs_mount_t *mp, xfs_bmbt_block_t *bb, int level)
+{
+	return XFS_BMAP_SANITY_CHECK(mp, bb, level);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMAPI_AFLAG)
+int
+xfs_bmapi_aflag(int w)
+{
+	return XFS_BMAPI_AFLAG(w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BMDR_SPACE_CALC)
+int
+xfs_bmdr_space_calc(int nrecs)
+{
+	return XFS_BMDR_SPACE_CALC(nrecs);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BNO_BLOCK)
+xfs_agblock_t
+xfs_bno_block(xfs_mount_t *mp)
+{
+	return XFS_BNO_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BTREE_LONG_PTRS)
+int
+xfs_btree_long_ptrs(xfs_btnum_t btnum)
+{
+	return XFS_BTREE_LONG_PTRS(btnum);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGF)
+xfs_agf_t *
+xfs_buf_to_agf(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_AGF(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGFL)
+xfs_agfl_t *
+xfs_buf_to_agfl(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_AGFL(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_AGI)
+xfs_agi_t *
+xfs_buf_to_agi(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_AGI(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_ALLOC_BLOCK)
+xfs_alloc_block_t *
+xfs_buf_to_alloc_block(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_ALLOC_BLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_BLOCK)
+xfs_btree_block_t *
+xfs_buf_to_block(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_BLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_BMBT_BLOCK)
+xfs_bmbt_block_t *
+xfs_buf_to_bmbt_block(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_BMBT_BLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_DINODE)
+xfs_dinode_t *
+xfs_buf_to_dinode(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_DINODE(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_INOBT_BLOCK)
+xfs_inobt_block_t *
+xfs_buf_to_inobt_block(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_INOBT_BLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_LBLOCK)
+xfs_btree_lblock_t *
+xfs_buf_to_lblock(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_LBLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_SBLOCK)
+xfs_btree_sblock_t *
+xfs_buf_to_sblock(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_SBLOCK(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_BUF_TO_SBP)
+xfs_sb_t *
+xfs_buf_to_sbp(xfs_buf_t *bp)
+{
+	return XFS_BUF_TO_SBP(bp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_ASIZE)
+int
+xfs_cfork_asize_arch(xfs_dinode_core_t *dcp, xfs_mount_t *mp, xfs_arch_t arch)
+{
+	return XFS_CFORK_ASIZE_ARCH(dcp, mp, arch);
+}
+int
+xfs_cfork_asize(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
+{
+	return XFS_CFORK_ASIZE(dcp, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_BOFF)
+int
+xfs_cfork_boff_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch)
+{
+	return XFS_CFORK_BOFF_ARCH(dcp, arch);
+}
+int
+xfs_cfork_boff(xfs_dinode_core_t *dcp)
+{
+	return XFS_CFORK_BOFF(dcp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_DSIZE)
+int
+xfs_cfork_dsize_arch(xfs_dinode_core_t *dcp, xfs_mount_t *mp, xfs_arch_t arch)
+{
+	return XFS_CFORK_DSIZE_ARCH(dcp, mp, arch);
+}
+int
+xfs_cfork_dsize(xfs_dinode_core_t *dcp, xfs_mount_t *mp)
+{
+	return XFS_CFORK_DSIZE(dcp, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_FMT_SET)
+void
+xfs_cfork_fmt_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch)
+{
+	XFS_CFORK_FMT_SET_ARCH(dcp, w, n, arch);
+}
+void
+xfs_cfork_fmt_set(xfs_dinode_core_t *dcp, int w, int n)
+{
+	XFS_CFORK_FMT_SET(dcp, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_FORMAT)
+int
+xfs_cfork_format_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch)
+{
+	return XFS_CFORK_FORMAT_ARCH(dcp, w, arch);
+}
+int
+xfs_cfork_format(xfs_dinode_core_t *dcp, int w)
+{
+	return XFS_CFORK_FORMAT(dcp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_NEXT_SET)
+void
+xfs_cfork_next_set_arch(xfs_dinode_core_t *dcp, int w, int n, xfs_arch_t arch)
+{
+	XFS_CFORK_NEXT_SET_ARCH(dcp, w, n, arch);
+}
+void
+xfs_cfork_next_set(xfs_dinode_core_t *dcp, int w, int n)
+{
+	XFS_CFORK_NEXT_SET(dcp, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_NEXTENTS)
+int
+xfs_cfork_nextents_arch(xfs_dinode_core_t *dcp, int w, xfs_arch_t arch)
+{
+	return XFS_CFORK_NEXTENTS_ARCH(dcp, w, arch);
+}
+int
+xfs_cfork_nextents(xfs_dinode_core_t *dcp, int w)
+{
+	return XFS_CFORK_NEXTENTS(dcp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_Q)
+int
+xfs_cfork_q_arch(xfs_dinode_core_t *dcp, xfs_arch_t arch)
+{
+	return XFS_CFORK_Q_ARCH(dcp, arch);
+}
+int
+xfs_cfork_q(xfs_dinode_core_t *dcp)
+{
+	return XFS_CFORK_Q(dcp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CFORK_SIZE)
+int
+xfs_cfork_size_arch(xfs_dinode_core_t *dcp, xfs_mount_t *mp, int w, xfs_arch_t arch)
+{
+	return XFS_CFORK_SIZE_ARCH(dcp, mp, w, arch);
+}
+int
+xfs_cfork_size(xfs_dinode_core_t *dcp, xfs_mount_t *mp, int w)
+{
+	return XFS_CFORK_SIZE(dcp, mp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_CNT_BLOCK)
+xfs_agblock_t
+xfs_cnt_block(xfs_mount_t *mp)
+{
+	return XFS_CNT_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_BNO)
+xfs_dablk_t
+xfs_da_cookie_bno(xfs_mount_t *mp, xfs_off_t cookie)
+{
+	return XFS_DA_COOKIE_BNO(mp, cookie);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_ENTRY)
+int
+xfs_da_cookie_entry(xfs_mount_t *mp, xfs_off_t cookie)
+{
+	return XFS_DA_COOKIE_ENTRY(mp, cookie);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_COOKIE_HASH)
+/*ARGSUSED1*/
+xfs_dahash_t
+xfs_da_cookie_hash(xfs_mount_t *mp, xfs_off_t cookie)
+{
+	return XFS_DA_COOKIE_HASH(mp, cookie);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_MAKE_BNOENTRY)
+__uint32_t
+xfs_da_make_bnoentry(xfs_mount_t *mp, xfs_dablk_t bno, int entry)
+{
+	return XFS_DA_MAKE_BNOENTRY(mp, bno, entry);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_MAKE_COOKIE)
+xfs_off_t
+xfs_da_make_cookie(xfs_mount_t *mp, xfs_dablk_t bno, int entry,
+		   xfs_dahash_t hash)
+{
+	return XFS_DA_MAKE_COOKIE(mp, bno, entry, hash);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DA_NODE_ENTRIES)
+int
+xfs_da_node_entries(xfs_mount_t *mp)
+{
+	return XFS_DA_NODE_ENTRIES(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_AGBNO)
+xfs_agblock_t
+xfs_daddr_to_agbno(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	return XFS_DADDR_TO_AGBNO(mp, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_AGNO)
+xfs_agnumber_t
+xfs_daddr_to_agno(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	return XFS_DADDR_TO_AGNO(mp, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DADDR_TO_FSB)
+xfs_fsblock_t
+xfs_daddr_to_fsb(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	return XFS_DADDR_TO_FSB(mp, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_APTR)
+char *
+xfs_dfork_aptr_arch(xfs_dinode_t *dip, xfs_arch_t arch)
+{
+	return XFS_DFORK_APTR_ARCH(dip, arch);
+}
+char *
+xfs_dfork_aptr(xfs_dinode_t *dip)
+{
+	return XFS_DFORK_APTR(dip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_ASIZE)
+int
+xfs_dfork_asize_arch(xfs_dinode_t *dip, xfs_mount_t *mp, xfs_arch_t arch)
+{
+	return XFS_DFORK_ASIZE_ARCH(dip, mp, arch);
+}
+int
+xfs_dfork_asize(xfs_dinode_t *dip, xfs_mount_t *mp)
+{
+	return XFS_DFORK_ASIZE(dip, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_BOFF)
+int
+xfs_dfork_boff_arch(xfs_dinode_t *dip, xfs_arch_t arch)
+{
+	return XFS_DFORK_BOFF_ARCH(dip, arch);
+}
+int
+xfs_dfork_boff(xfs_dinode_t *dip)
+{
+	return XFS_DFORK_BOFF(dip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_DPTR)
+char *
+xfs_dfork_dptr_arch(xfs_dinode_t *dip, xfs_arch_t arch)
+{
+	return XFS_DFORK_DPTR_ARCH(dip, arch);
+}
+char *
+xfs_dfork_dptr(xfs_dinode_t *dip)
+{
+	return XFS_DFORK_DPTR(dip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_DSIZE)
+int
+xfs_dfork_dsize_arch(xfs_dinode_t *dip, xfs_mount_t *mp, xfs_arch_t arch)
+{
+	return XFS_DFORK_DSIZE_ARCH(dip, mp, arch);
+}
+int
+xfs_dfork_dsize(xfs_dinode_t *dip, xfs_mount_t *mp)
+{
+	return XFS_DFORK_DSIZE(dip, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_FMT_SET)
+void
+xfs_dfork_fmt_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch)
+{
+	XFS_DFORK_FMT_SET_ARCH(dip, w, n, arch);
+}
+void
+xfs_dfork_fmt_set(xfs_dinode_t *dip, int w, int n)
+{
+	XFS_DFORK_FMT_SET(dip, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_FORMAT)
+int
+xfs_dfork_format_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch)
+{
+	return XFS_DFORK_FORMAT_ARCH(dip, w, arch);
+}
+int
+xfs_dfork_format(xfs_dinode_t *dip, int w)
+{
+	return XFS_DFORK_FORMAT(dip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_NEXT_SET)
+void
+xfs_dfork_next_set_arch(xfs_dinode_t *dip, int w, int n, xfs_arch_t arch)
+{
+	XFS_DFORK_NEXT_SET_ARCH(dip, w, n, arch);
+}
+void
+xfs_dfork_next_set(xfs_dinode_t *dip, int w, int n)
+{
+	XFS_DFORK_NEXT_SET(dip, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_NEXTENTS)
+int
+xfs_dfork_nextents_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch)
+{
+	return XFS_DFORK_NEXTENTS_ARCH(dip, w, arch);
+}
+int
+xfs_dfork_nextents(xfs_dinode_t *dip, int w)
+{
+	return XFS_DFORK_NEXTENTS(dip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_PTR)
+char *
+xfs_dfork_ptr_arch(xfs_dinode_t *dip, int w, xfs_arch_t arch)
+{
+	return XFS_DFORK_PTR_ARCH(dip, w, arch);
+}
+char *
+xfs_dfork_ptr(xfs_dinode_t *dip, int w)
+{
+	return XFS_DFORK_PTR(dip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_Q)
+int
+xfs_dfork_q_arch(xfs_dinode_t *dip, xfs_arch_t arch)
+{
+	return XFS_DFORK_Q_ARCH(dip, arch);
+}
+int
+xfs_dfork_q(xfs_dinode_t *dip)
+{
+	return XFS_DFORK_Q(dip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DFORK_SIZE)
+int
+xfs_dfork_size_arch(xfs_dinode_t *dip, xfs_mount_t *mp, int w, xfs_arch_t arch)
+{
+	return XFS_DFORK_SIZE_ARCH(dip, mp, w, arch);
+}
+int
+xfs_dfork_size(xfs_dinode_t *dip, xfs_mount_t *mp, int w)
+{
+	return XFS_DFORK_SIZE(dip, mp, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DINODE_GOOD_VERSION)
+int
+xfs_dinode_good_version(int v)
+{
+	return XFS_DINODE_GOOD_VERSION(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYENTRY)
+int
+xfs_dir_leaf_entsize_byentry(xfs_dir_leaf_entry_t *entry)
+{
+	return XFS_DIR_LEAF_ENTSIZE_BYENTRY(entry);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_ENTSIZE_BYNAME)
+int
+xfs_dir_leaf_entsize_byname(int len)
+{
+	return XFS_DIR_LEAF_ENTSIZE_BYNAME(len);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_LEAF_NAMESTRUCT)
+xfs_dir_leaf_name_t *
+xfs_dir_leaf_namestruct(xfs_dir_leafblock_t *leafp, int offset)
+{
+	return XFS_DIR_LEAF_NAMESTRUCT(leafp, offset);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ALLFIT)
+int
+xfs_dir_sf_allfit(int count, int totallen)
+{
+	return XFS_DIR_SF_ALLFIT(count, totallen);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ENTSIZE_BYENTRY)
+int
+xfs_dir_sf_entsize_byentry(xfs_dir_sf_entry_t *sfep)
+{
+	return XFS_DIR_SF_ENTSIZE_BYENTRY(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_ENTSIZE_BYNAME)
+int
+xfs_dir_sf_entsize_byname(int len)
+{
+	return XFS_DIR_SF_ENTSIZE_BYNAME(len);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_GET_DIRINO)
+void
+xfs_dir_sf_get_dirino_arch(xfs_dir_ino_t *from, xfs_ino_t *to, xfs_arch_t arch)
+{
+	XFS_DIR_SF_GET_DIRINO_ARCH(from, to, arch);
+}
+void
+xfs_dir_sf_get_dirino(xfs_dir_ino_t *from, xfs_ino_t *to)
+{
+	XFS_DIR_SF_GET_DIRINO(from, to);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_NEXTENTRY)
+xfs_dir_sf_entry_t *
+xfs_dir_sf_nextentry(xfs_dir_sf_entry_t *sfep)
+{
+	return XFS_DIR_SF_NEXTENTRY(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR_SF_PUT_DIRINO)
+void
+xfs_dir_sf_put_dirino_arch(xfs_ino_t *from, xfs_dir_ino_t *to, xfs_arch_t arch)
+{
+	XFS_DIR_SF_PUT_DIRINO_ARCH(from, to, arch);
+}
+void
+xfs_dir_sf_put_dirino(xfs_ino_t *from, xfs_dir_ino_t *to)
+{
+	XFS_DIR_SF_PUT_DIRINO(from, to);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BLOCK_LEAF_P)
+xfs_dir2_leaf_entry_t *
+xfs_dir2_block_leaf_p_arch(xfs_dir2_block_tail_t *btp, xfs_arch_t arch)
+{
+	return XFS_DIR2_BLOCK_LEAF_P_ARCH(btp,arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BLOCK_TAIL_P)
+xfs_dir2_block_tail_t *
+xfs_dir2_block_tail_p(xfs_mount_t *mp, xfs_dir2_block_t *block)
+{
+	return XFS_DIR2_BLOCK_TAIL_P(mp, block);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DA)
+xfs_dablk_t
+xfs_dir2_byte_to_da(xfs_mount_t *mp, xfs_dir2_off_t by)
+{
+	return XFS_DIR2_BYTE_TO_DA(mp, by);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DATAPTR)
+/* ARGSUSED */
+xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_mount_t *mp, xfs_dir2_off_t by)
+{
+	return XFS_DIR2_BYTE_TO_DATAPTR(mp, by);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_DB)
+xfs_dir2_db_t
+xfs_dir2_byte_to_db(xfs_mount_t *mp, xfs_dir2_off_t by)
+{
+	return XFS_DIR2_BYTE_TO_DB(mp, by);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_BYTE_TO_OFF)
+xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(xfs_mount_t *mp, xfs_dir2_off_t by)
+{
+	return XFS_DIR2_BYTE_TO_OFF(mp, by);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DA_TO_BYTE)
+xfs_dir2_off_t
+xfs_dir2_da_to_byte(xfs_mount_t *mp, xfs_dablk_t da)
+{
+	return XFS_DIR2_DA_TO_BYTE(mp, da);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DA_TO_DB)
+xfs_dir2_db_t
+xfs_dir2_da_to_db(xfs_mount_t *mp, xfs_dablk_t da)
+{
+	return XFS_DIR2_DA_TO_DB(mp, da);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_ENTRY_TAG_P)
+xfs_dir2_data_off_t *
+xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep)
+{
+	return XFS_DIR2_DATA_ENTRY_TAG_P(dep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_ENTSIZE)
+int
+xfs_dir2_data_entsize(int n)
+{
+	return XFS_DIR2_DATA_ENTSIZE(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATA_UNUSED_TAG_P)
+xfs_dir2_data_off_t *
+xfs_dir2_data_unused_tag_p_arch(xfs_dir2_data_unused_t *dup, xfs_arch_t arch)
+{
+	return XFS_DIR2_DATA_UNUSED_TAG_P_ARCH(dup,arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_BYTE)
+/* ARGSUSED */
+xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
+{
+	return XFS_DIR2_DATAPTR_TO_BYTE(mp, dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_DB)
+xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
+{
+	return XFS_DIR2_DATAPTR_TO_DB(mp, dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DATAPTR_TO_OFF)
+xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(xfs_mount_t *mp, xfs_dir2_dataptr_t dp)
+{
+	return XFS_DIR2_DATAPTR_TO_OFF(mp, dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_OFF_TO_BYTE)
+xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(xfs_mount_t *mp, xfs_dir2_db_t db,
+			xfs_dir2_data_aoff_t o)
+{
+	return XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_OFF_TO_DATAPTR)
+xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(xfs_mount_t *mp, xfs_dir2_db_t db,
+			   xfs_dir2_data_aoff_t o)
+{
+	return XFS_DIR2_DB_OFF_TO_DATAPTR(mp, db, o);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_DA)
+xfs_dablk_t
+xfs_dir2_db_to_da(xfs_mount_t *mp, xfs_dir2_db_t db)
+{
+	return XFS_DIR2_DB_TO_DA(mp, db);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_FDB)
+xfs_dir2_db_t
+xfs_dir2_db_to_fdb(xfs_mount_t *mp, xfs_dir2_db_t db)
+{
+	return XFS_DIR2_DB_TO_FDB(mp, db);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_DB_TO_FDINDEX)
+int
+xfs_dir2_db_to_fdindex(xfs_mount_t *mp, xfs_dir2_db_t db)
+{
+	return XFS_DIR2_DB_TO_FDINDEX(mp, db);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_LEAF_BESTS_P)
+xfs_dir2_data_off_t *
+xfs_dir2_leaf_bests_p_arch(xfs_dir2_leaf_tail_t *ltp, xfs_arch_t arch)
+{
+	return XFS_DIR2_LEAF_BESTS_P_ARCH(ltp, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_LEAF_TAIL_P)
+xfs_dir2_leaf_tail_t *
+xfs_dir2_leaf_tail_p(xfs_mount_t *mp, xfs_dir2_leaf_t *lp)
+{
+	return XFS_DIR2_LEAF_TAIL_P(mp, lp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_MAX_LEAF_ENTS)
+int
+xfs_dir2_max_leaf_ents(xfs_mount_t *mp)
+{
+	return XFS_DIR2_MAX_LEAF_ENTS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_ENTSIZE_BYENTRY)
+int
+xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
+{
+	return XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp, sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_FIRSTENTRY)
+xfs_dir2_sf_entry_t *
+xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp)
+{
+	return XFS_DIR2_SF_FIRSTENTRY(sfp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_ENTSIZE_BYNAME)
+int
+xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len)
+{
+	return XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, len);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_GET_INUMBER)
+xfs_intino_t
+xfs_dir2_sf_get_inumber_arch(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from, xfs_arch_t arch)
+{
+	return XFS_DIR2_SF_GET_INUMBER_ARCH(sfp, from, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_GET_OFFSET)
+xfs_dir2_data_aoff_t
+xfs_dir2_sf_get_offset_arch(xfs_dir2_sf_entry_t *sfep, xfs_arch_t arch)
+{
+	return XFS_DIR2_SF_GET_OFFSET_ARCH(sfep, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_HDR_SIZE)
+int
+xfs_dir2_sf_hdr_size(int i8count)
+{
+	return XFS_DIR2_SF_HDR_SIZE(i8count);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_INUMBERP)
+xfs_dir2_inou_t *
+xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep)
+{
+	return XFS_DIR2_SF_INUMBERP(sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_NEXTENTRY)
+xfs_dir2_sf_entry_t *
+xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep)
+{
+	return XFS_DIR2_SF_NEXTENTRY(sfp, sfep);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_PUT_INUMBER)
+void
+xfs_dir2_sf_put_inumber_arch(xfs_dir2_sf_t *sfp, xfs_ino_t *from, xfs_dir2_inou_t *to, xfs_arch_t arch)
+{
+	XFS_DIR2_SF_PUT_INUMBER_ARCH(sfp, from, to, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_DIR2_SF_PUT_OFFSET)
+void
+xfs_dir2_sf_put_offset_arch(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off, xfs_arch_t arch)
+{
+	XFS_DIR2_SF_PUT_OFFSET_ARCH(sfep, off, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTFMT_INODE )
+xfs_exntfmt_t
+xfs_extfmt_inode(struct xfs_inode *ip)
+{
+	return XFS_EXTFMT_INODE(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTLEN_MAX)
+xfs_extlen_t
+xfs_extlen_max(xfs_extlen_t a, xfs_extlen_t b)
+{
+	return XFS_EXTLEN_MAX(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_EXTLEN_MIN)
+xfs_extlen_t
+xfs_extlen_min(xfs_extlen_t a, xfs_extlen_t b)
+{
+	return XFS_EXTLEN_MIN(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILBLKS_MAX)
+xfs_filblks_t
+xfs_filblks_max(xfs_filblks_t a, xfs_filblks_t b)
+{
+	return XFS_FILBLKS_MAX(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILBLKS_MIN)
+xfs_filblks_t
+xfs_filblks_min(xfs_filblks_t a, xfs_filblks_t b)
+{
+	return XFS_FILBLKS_MIN(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILEOFF_MAX)
+xfs_fileoff_t
+xfs_fileoff_max(xfs_fileoff_t a, xfs_fileoff_t b)
+{
+	return XFS_FILEOFF_MAX(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FILEOFF_MIN)
+xfs_fileoff_t
+xfs_fileoff_min(xfs_fileoff_t a, xfs_fileoff_t b)
+{
+	return XFS_FILEOFF_MIN(a, b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_SANITY_CHECK)
+int
+xfs_fsb_sanity_check(xfs_mount_t *mp, xfs_fsblock_t fsbno)
+{
+	return XFS_FSB_SANITY_CHECK(mp, fsbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_AGBNO)
+xfs_agblock_t
+xfs_fsb_to_agbno(xfs_mount_t *mp, xfs_fsblock_t fsbno)
+{
+	return XFS_FSB_TO_AGBNO(mp, fsbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_AGNO)
+xfs_agnumber_t
+xfs_fsb_to_agno(xfs_mount_t *mp, xfs_fsblock_t fsbno)
+{
+	return XFS_FSB_TO_AGNO(mp, fsbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_DADDR)
+xfs_daddr_t
+xfs_fsb_to_daddr(xfs_mount_t *mp, xfs_fsblock_t fsbno)
+{
+	return XFS_FSB_TO_DADDR(mp, fsbno);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_FSB_TO_DB)
+xfs_daddr_t
+xfs_fsb_to_db(xfs_inode_t *ip, xfs_fsblock_t fsb)
+{
+	return XFS_FSB_TO_DB(ip, fsb);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_HDR_BLOCK)
+xfs_agblock_t
+xfs_hdr_block(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	return XFS_HDR_BLOCK(mp, d);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_BLOCKS)
+xfs_extlen_t
+xfs_ialloc_blocks(xfs_mount_t *mp)
+{
+	return XFS_IALLOC_BLOCKS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_FIND_FREE)
+int
+xfs_ialloc_find_free(xfs_inofree_t *fp)
+{
+	return XFS_IALLOC_FIND_FREE(fp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IALLOC_INODES)
+int
+xfs_ialloc_inodes(xfs_mount_t *mp)
+{
+	return XFS_IALLOC_INODES(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IBT_BLOCK)
+xfs_agblock_t
+xfs_ibt_block(xfs_mount_t *mp)
+{
+	return XFS_IBT_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_ASIZE)
+int
+xfs_ifork_asize(xfs_inode_t *ip)
+{
+	return XFS_IFORK_ASIZE(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_DSIZE)
+int
+xfs_ifork_dsize(xfs_inode_t *ip)
+{
+	return XFS_IFORK_DSIZE(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_FMT_SET)
+void
+xfs_ifork_fmt_set(xfs_inode_t *ip, int w, int n)
+{
+	XFS_IFORK_FMT_SET(ip, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_FORMAT)
+int
+xfs_ifork_format(xfs_inode_t *ip, int w)
+{
+	return XFS_IFORK_FORMAT(ip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_NEXT_SET)
+void
+xfs_ifork_next_set(xfs_inode_t *ip, int w, int n)
+{
+	XFS_IFORK_NEXT_SET(ip, w, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_NEXTENTS)
+int
+xfs_ifork_nextents(xfs_inode_t *ip, int w)
+{
+	return XFS_IFORK_NEXTENTS(ip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_PTR)
+xfs_ifork_t *
+xfs_ifork_ptr(xfs_inode_t *ip, int w)
+{
+	return XFS_IFORK_PTR(ip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_Q)
+int
+xfs_ifork_q(xfs_inode_t *ip)
+{
+	return XFS_IFORK_Q(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IFORK_SIZE)
+int
+xfs_ifork_size(xfs_inode_t *ip, int w)
+{
+	return XFS_IFORK_SIZE(ip, w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FBROOT)
+int
+xfs_ilog_fbroot(int w)
+{
+	return XFS_ILOG_FBROOT(w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FDATA)
+int
+xfs_ilog_fdata(int w)
+{
+	return XFS_ILOG_FDATA(w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ILOG_FEXT)
+int
+xfs_ilog_fext(int w)
+{
+	return XFS_ILOG_FEXT(w);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_IN_MAXLEVELS)
+int
+xfs_in_maxlevels(xfs_mount_t *mp)
+{
+	return XFS_IN_MAXLEVELS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGBNO_BITS)
+int
+xfs_ino_agbno_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_AGBNO_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGINO_BITS)
+int
+xfs_ino_agino_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_AGINO_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_AGNO_BITS)
+int
+xfs_ino_agno_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_AGNO_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_BITS)
+int
+xfs_ino_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_MASK)
+__uint32_t
+xfs_ino_mask(int k)
+{
+	return XFS_INO_MASK(k);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_OFFSET_BITS)
+int
+xfs_ino_offset_bits(xfs_mount_t *mp)
+{
+	return XFS_INO_OFFSET_BITS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGBNO)
+xfs_agblock_t
+xfs_ino_to_agbno(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_AGBNO(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGINO)
+xfs_agino_t
+xfs_ino_to_agino(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_AGINO(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_AGNO)
+xfs_agnumber_t
+xfs_ino_to_agno(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_AGNO(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_FSB)
+xfs_fsblock_t
+xfs_ino_to_fsb(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_FSB(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INO_TO_OFFSET)
+int
+xfs_ino_to_offset(xfs_mount_t *mp, xfs_ino_t i)
+{
+	return XFS_INO_TO_OFFSET(mp, i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_MAXRECS)
+int
+xfs_inobt_block_maxrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_BLOCK_MAXRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_MINRECS)
+int
+xfs_inobt_block_minrecs(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_BLOCK_MINRECS(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_BLOCK_SIZE)
+/*ARGSUSED1*/
+int
+xfs_inobt_block_size(int lev, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_BLOCK_SIZE(lev, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_CLR_FREE)
+void
+xfs_inobt_clr_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch)
+{
+	XFS_INOBT_CLR_FREE(rp, i, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_IS_FREE)
+int
+xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch)
+{
+	return XFS_INOBT_IS_FREE(rp, i, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_IS_LAST_REC)
+int
+xfs_inobt_is_last_rec(xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_IS_LAST_REC(cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_KEY_ADDR)
+/*ARGSUSED3*/
+xfs_inobt_key_t *
+xfs_inobt_key_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_KEY_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_MASK)
+xfs_inofree_t
+xfs_inobt_mask(int i)
+{
+	return XFS_INOBT_MASK(i);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_MASKN)
+xfs_inofree_t
+xfs_inobt_maskn(int i, int n)
+{
+	return XFS_INOBT_MASKN(i, n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_PTR_ADDR)
+xfs_inobt_ptr_t *
+xfs_inobt_ptr_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_PTR_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_REC_ADDR)
+/*ARGSUSED3*/
+xfs_inobt_rec_t *
+xfs_inobt_rec_addr(xfs_inobt_block_t *bb, int i, xfs_btree_cur_t *cur)
+{
+	return XFS_INOBT_REC_ADDR(bb, i, cur);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_SET_FREE)
+void
+xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i, xfs_arch_t arch)
+{
+	XFS_INOBT_SET_FREE(rp, i, arch);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ITOBHV)
+bhv_desc_t *
+xfs_itobhv(xfs_inode_t *ip)
+{
+	return XFS_ITOBHV(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_ITOV)
+vnode_t *
+xfs_itov(xfs_inode_t *ip)
+{
+	return XFS_ITOV(ip);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LBLOG)
+int
+xfs_lblog(xfs_mount_t *mp)
+{
+	return XFS_LBLOG(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LBSIZE)
+int
+xfs_lbsize(xfs_mount_t *mp)
+{
+	return XFS_LBSIZE(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ALL_FREE)
+void
+xfs_lic_all_free(xfs_log_item_chunk_t *cp)
+{
+	XFS_LIC_ALL_FREE(cp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ARE_ALL_FREE)
+int
+xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
+{
+	return XFS_LIC_ARE_ALL_FREE(cp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_CLAIM)
+void
+xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
+{
+	XFS_LIC_CLAIM(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_DESC_TO_CHUNK)
+xfs_log_item_chunk_t *
+xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
+{
+	return XFS_LIC_DESC_TO_CHUNK(dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_DESC_TO_SLOT)
+int
+xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
+{
+	return XFS_LIC_DESC_TO_SLOT(dp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_INIT)
+void
+xfs_lic_init(xfs_log_item_chunk_t *cp)
+{
+	XFS_LIC_INIT(cp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_INIT_SLOT)
+void
+xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
+{
+	XFS_LIC_INIT_SLOT(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_ISFREE)
+int
+xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
+{
+	return XFS_LIC_ISFREE(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_RELSE)
+void
+xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
+{
+	XFS_LIC_RELSE(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_SLOT)
+xfs_log_item_desc_t *
+xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
+{
+	return XFS_LIC_SLOT(cp, slot);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LIC_VACANCY)
+int
+xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
+{
+	return XFS_LIC_VACANCY(cp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_LITINO)
+int
+xfs_litino(xfs_mount_t *mp)
+{
+	return XFS_LITINO(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MAKE_IPTR)
+xfs_dinode_t *
+xfs_make_iptr(xfs_mount_t *mp, xfs_buf_t *b, int o)
+{
+	return XFS_MAKE_IPTR(mp, b, o);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK32HI)
+__uint32_t
+xfs_mask32hi(int n)
+{
+	return XFS_MASK32HI(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK32LO)
+__uint32_t
+xfs_mask32lo(int n)
+{
+	return XFS_MASK32LO(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK64HI)
+__uint64_t
+xfs_mask64hi(int n)
+{
+	return XFS_MASK64HI(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MASK64LO)
+__uint64_t
+xfs_mask64lo(int n)
+{
+	return XFS_MASK64LO(n);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST)
+int
+xfs_min_freelist(xfs_agf_t *a, xfs_mount_t *mp)
+{
+	return XFS_MIN_FREELIST(a, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST_PAG)
+int
+xfs_min_freelist_pag(xfs_perag_t *pag, xfs_mount_t *mp)
+{
+	return XFS_MIN_FREELIST_PAG(pag, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MIN_FREELIST_RAW)
+int
+xfs_min_freelist_raw(uint bl, uint cl, xfs_mount_t *mp)
+{
+	return XFS_MIN_FREELIST_RAW(bl, cl, mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_MTOVFS)
+vfs_t *
+xfs_mtovfs(xfs_mount_t *mp)
+{
+	return XFS_MTOVFS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_OFFBNO_TO_AGINO)
+xfs_agino_t
+xfs_offbno_to_agino(xfs_mount_t *mp, xfs_agblock_t b, int o)
+{
+	return XFS_OFFBNO_TO_AGINO(mp, b, o);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_PREALLOC_BLOCKS)
+xfs_agblock_t
+xfs_prealloc_blocks(xfs_mount_t *mp)
+{
+	return XFS_PREALLOC_BLOCKS(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_BLOCK)
+xfs_agblock_t
+xfs_sb_block(xfs_mount_t *mp)
+{
+	return XFS_SB_BLOCK(mp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_GOOD_VERSION)
+int
+xfs_sb_good_version(xfs_sb_t *sbp)
+{
+	return XFS_SB_GOOD_VERSION(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDATTR)
+void
+xfs_sb_version_addattr(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDATTR(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDDALIGN)
+void
+xfs_sb_version_adddalign(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDDALIGN(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDNLINK)
+void
+xfs_sb_version_addnlink(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDNLINK(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDQUOTA)
+void
+xfs_sb_version_addquota(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDQUOTA(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_ADDSHARED)
+void
+xfs_sb_version_addshared(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_ADDSHARED(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASALIGN)
+int
+xfs_sb_version_hasalign(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASALIGN(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASATTR)
+int
+xfs_sb_version_hasattr(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASATTR(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASDALIGN)
+int
+xfs_sb_version_hasdalign(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASDALIGN(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASDIRV2)
+int
+xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASDIRV2(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASEXTFLGBIT)
+int
+xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASEXTFLGBIT(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASNLINK)
+int
+xfs_sb_version_hasnlink(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASNLINK(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASQUOTA)
+int
+xfs_sb_version_hasquota(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASQUOTA(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASSHARED)
+int
+xfs_sb_version_hasshared(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASSHARED(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_NUM)
+int
+xfs_sb_version_num(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_SUBALIGN)
+void
+xfs_sb_version_subalign(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_SUBALIGN(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_SUBSHARED)
+void
+xfs_sb_version_subshared(xfs_sb_t *sbp)
+{
+	XFS_SB_VERSION_SUBSHARED(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_HASLOGV2)
+int
+xfs_sb_version_haslogv2(xfs_sb_t *sbp)
+{
+	return XFS_SB_VERSION_HASLOGV2(sbp);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_TONEW)
+unsigned
+xfs_sb_version_tonew(unsigned v)
+{
+	return XFS_SB_VERSION_TONEW(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_SB_VERSION_TOOLD)
+unsigned
+xfs_sb_version_toold(unsigned v)
+{
+	return XFS_SB_VERSION_TOOLD(v);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_BTOLRBB)
+int
+xlog_btolrbb(int b)
+{
+	return XLOG_BTOLRBB(b);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_GRANT_ADD_SPACE)
+void
+xlog_grant_add_space(xlog_t *log, int bytes, int type)
+{
+	XLOG_GRANT_ADD_SPACE(log, bytes, type);
+}
+#endif
+
+#if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XLOG_GRANT_SUB_SPACE)
+void
+xlog_grant_sub_space(xlog_t *log, int bytes, int type)
+{
+	XLOG_GRANT_SUB_SPACE(log, bytes, type);
+}
+#endif
diff --git a/fs/xfs/xfs_macros.h b/fs/xfs/xfs_macros.h
new file mode 100644
index 000000000000..2683a48c0fd3
--- /dev/null
+++ b/fs/xfs/xfs_macros.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_MACROS_H__
+#define __XFS_MACROS_H__
+
+/*
+ * Set for debug kernels and simulation, and 32-bit kernels,
+ * but not for standalone.  These replacements save space.
+ * Used in xfs_macros.c.
+ */
+#define XFS_WANT_SPACE_C	\
+	(!defined(_STANDALONE) && \
+	 (defined(DEBUG) || (defined(_KERNEL))))
+
+/*
+ * Set for debug simulation and kernel builds, but not for standalone.
+ * These replacements do not save space.
+ * Used in xfs_macros.c.
+ */
+#define XFS_WANT_FUNCS_C	\
+	(!defined(_STANDALONE) && defined(DEBUG))
+
+/*
+ * Corresponding names used in .h files.
+ */
+#define XFS_WANT_SPACE	(XFS_WANT_SPACE_C && !defined(XFS_MACRO_C))
+#define XFS_WANT_FUNCS	(XFS_WANT_FUNCS_C && !defined(XFS_MACRO_C))
+
+/*
+ * These are the macros that get turned into functions to save space.
+ */
+#define XFSSO_NULLSTARTBLOCK 1
+#define XFSSO_XFS_AGB_TO_DADDR 1
+#define XFSSO_XFS_AGB_TO_FSB 1
+#define XFSSO_XFS_AGINO_TO_INO 1
+#define XFSSO_XFS_ALLOC_BLOCK_MINRECS 1
+#define XFSSO_XFS_ATTR_SF_NEXTENTRY 1
+#define XFSSO_XFS_BMAP_BLOCK_DMAXRECS 1
+#define XFSSO_XFS_BMAP_BLOCK_IMAXRECS 1
+#define XFSSO_XFS_BMAP_BLOCK_IMINRECS 1
+#define XFSSO_XFS_BMAP_INIT 1
+#define XFSSO_XFS_BMAP_PTR_IADDR 1
+#define XFSSO_XFS_BMAP_SANITY_CHECK 1
+#define XFSSO_XFS_BMAPI_AFLAG 1
+#define XFSSO_XFS_CFORK_SIZE 1
+#define XFSSO_XFS_DA_COOKIE_BNO 1
+#define XFSSO_XFS_DA_COOKIE_ENTRY 1
+#define XFSSO_XFS_DADDR_TO_AGBNO 1
+#define XFSSO_XFS_DADDR_TO_FSB 1
+#define XFSSO_XFS_DFORK_PTR 1
+#define XFSSO_XFS_DIR_SF_GET_DIRINO 1
+#define XFSSO_XFS_DIR_SF_NEXTENTRY 1
+#define XFSSO_XFS_DIR_SF_PUT_DIRINO 1
+#define XFSSO_XFS_FILBLKS_MIN 1
+#define XFSSO_XFS_FSB_SANITY_CHECK 1
+#define XFSSO_XFS_FSB_TO_DADDR 1
+#define XFSSO_XFS_FSB_TO_DB 1
+#define XFSSO_XFS_IALLOC_INODES 1
+#define XFSSO_XFS_IFORK_ASIZE 1
+#define XFSSO_XFS_IFORK_DSIZE 1
+#define XFSSO_XFS_IFORK_FORMAT 1
+#define XFSSO_XFS_IFORK_NEXT_SET 1
+#define XFSSO_XFS_IFORK_NEXTENTS 1
+#define XFSSO_XFS_IFORK_PTR 1
+#define XFSSO_XFS_ILOG_FBROOT 1
+#define XFSSO_XFS_ILOG_FEXT 1
+#define XFSSO_XFS_INO_MASK 1
+#define XFSSO_XFS_INO_TO_FSB 1
+#define XFSSO_XFS_INODE_CLEAR_READ_AHEAD 1
+#define XFSSO_XFS_MIN_FREELIST 1
+#define XFSSO_XFS_SB_GOOD_VERSION 1
+#define XFSSO_XFS_SB_VERSION_HASNLINK 1
+#define XFSSO_XLOG_GRANT_ADD_SPACE 1
+#define XFSSO_XLOG_GRANT_SUB_SPACE 1
+
+#endif	/* __XFS_MACROS_H__ */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
new file mode 100644
index 000000000000..e59a21d3348b
--- /dev/null
+++ b/fs/xfs/xfs_mount.c
@@ -0,0 +1,1737 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+STATIC void	xfs_mount_reset_sbqflags(xfs_mount_t *);
+STATIC void	xfs_mount_log_sbunit(xfs_mount_t *, __int64_t);
+STATIC int	xfs_uuid_mount(xfs_mount_t *);
+
+mutex_t		xfs_uuidtabmon;		/* monitor for uuidtab */
+STATIC int	xfs_uuidtab_size;
+STATIC uuid_t	*xfs_uuidtab;
+
+STATIC void	xfs_uuid_unmount(xfs_mount_t *);
+
+void xfs_xlatesb(void *, xfs_sb_t *, int, xfs_arch_t, __int64_t);
+
+static struct {
+    short offset;
+    short type;	    /* 0 = integer
+		     * 1 = binary / string (no translation)
+		     */
+} xfs_sb_info[] = {
+    { offsetof(xfs_sb_t, sb_magicnum),	 0 },
+    { offsetof(xfs_sb_t, sb_blocksize),	 0 },
+    { offsetof(xfs_sb_t, sb_dblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_rblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_rextents),	 0 },
+    { offsetof(xfs_sb_t, sb_uuid),	 1 },
+    { offsetof(xfs_sb_t, sb_logstart),	 0 },
+    { offsetof(xfs_sb_t, sb_rootino),	 0 },
+    { offsetof(xfs_sb_t, sb_rbmino),	 0 },
+    { offsetof(xfs_sb_t, sb_rsumino),	 0 },
+    { offsetof(xfs_sb_t, sb_rextsize),	 0 },
+    { offsetof(xfs_sb_t, sb_agblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_agcount),	 0 },
+    { offsetof(xfs_sb_t, sb_rbmblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_logblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_versionnum), 0 },
+    { offsetof(xfs_sb_t, sb_sectsize),	 0 },
+    { offsetof(xfs_sb_t, sb_inodesize),	 0 },
+    { offsetof(xfs_sb_t, sb_inopblock),	 0 },
+    { offsetof(xfs_sb_t, sb_fname[0]),	 1 },
+    { offsetof(xfs_sb_t, sb_blocklog),	 0 },
+    { offsetof(xfs_sb_t, sb_sectlog),	 0 },
+    { offsetof(xfs_sb_t, sb_inodelog),	 0 },
+    { offsetof(xfs_sb_t, sb_inopblog),	 0 },
+    { offsetof(xfs_sb_t, sb_agblklog),	 0 },
+    { offsetof(xfs_sb_t, sb_rextslog),	 0 },
+    { offsetof(xfs_sb_t, sb_inprogress), 0 },
+    { offsetof(xfs_sb_t, sb_imax_pct),	 0 },
+    { offsetof(xfs_sb_t, sb_icount),	 0 },
+    { offsetof(xfs_sb_t, sb_ifree),	 0 },
+    { offsetof(xfs_sb_t, sb_fdblocks),	 0 },
+    { offsetof(xfs_sb_t, sb_frextents),	 0 },
+    { offsetof(xfs_sb_t, sb_uquotino),	 0 },
+    { offsetof(xfs_sb_t, sb_gquotino),	 0 },
+    { offsetof(xfs_sb_t, sb_qflags),	 0 },
+    { offsetof(xfs_sb_t, sb_flags),	 0 },
+    { offsetof(xfs_sb_t, sb_shared_vn),	 0 },
+    { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
+    { offsetof(xfs_sb_t, sb_unit),	 0 },
+    { offsetof(xfs_sb_t, sb_width),	 0 },
+    { offsetof(xfs_sb_t, sb_dirblklog),	 0 },
+    { offsetof(xfs_sb_t, sb_dummy),	 1 },
+    { offsetof(xfs_sb_t, sb_logsunit),	 0 },
+    { sizeof(xfs_sb_t),			 0 }
+};
+
+/*
+ * Return a pointer to an initialized xfs_mount structure.
+ */
+xfs_mount_t *
+xfs_mount_init(void)
+{
+	xfs_mount_t *mp;
+
+	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
+
+	AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
+	spinlock_init(&mp->m_sb_lock, "xfs_sb");
+	mutex_init(&mp->m_ilock, MUTEX_DEFAULT, "xfs_ilock");
+	initnsema(&mp->m_growlock, 1, "xfs_grow");
+	/*
+	 * Initialize the AIL.
+	 */
+	xfs_trans_ail_init(mp);
+
+	/* Init freeze sync structures */
+	spinlock_init(&mp->m_freeze_lock, "xfs_freeze");
+	init_sv(&mp->m_wait_unfreeze, SV_DEFAULT, "xfs_freeze", 0);
+	atomic_set(&mp->m_active_trans, 0);
+
+	return mp;
+}	/* xfs_mount_init */
+
+/*
+ * Free up the resources associated with a mount structure.  Assume that
+ * the structure was initially zeroed, so we can tell which fields got
+ * initialized.
+ */
+void
+xfs_mount_free(
+	xfs_mount_t *mp,
+	int	    remove_bhv)
+{
+	if (mp->m_ihash)
+		xfs_ihash_free(mp);
+	if (mp->m_chash)
+		xfs_chash_free(mp);
+
+	if (mp->m_perag) {
+		int	agno;
+
+		for (agno = 0; agno < mp->m_maxagi; agno++)
+			if (mp->m_perag[agno].pagb_list)
+				kmem_free(mp->m_perag[agno].pagb_list,
+				  sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
+		kmem_free(mp->m_perag,
+			  sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
+	}
+
+#if 0
+	/*
+	 * XXXdpd - Doesn't work now for shutdown case.
+	 * Should at least free the memory.
+	 */
+	ASSERT(mp->m_ail.ail_back == (xfs_log_item_t*)&(mp->m_ail));
+	ASSERT(mp->m_ail.ail_forw == (xfs_log_item_t*)&(mp->m_ail));
+#endif
+	AIL_LOCK_DESTROY(&mp->m_ail_lock);
+	spinlock_destroy(&mp->m_sb_lock);
+	mutex_destroy(&mp->m_ilock);
+	freesema(&mp->m_growlock);
+
+	if (mp->m_fsname != NULL) {
+		kmem_free(mp->m_fsname, mp->m_fsname_len);
+	}
+	if (mp->m_quotainfo != NULL) {
+		xfs_qm_unmount_quotadestroy(mp);
+	}
+
+	if (remove_bhv) {
+		VFS_REMOVEBHV(XFS_MTOVFS(mp), &mp->m_bhv);
+	}
+	spinlock_destroy(&mp->m_freeze_lock);
+	sv_destroy(&mp->m_wait_unfreeze);
+	kmem_free(mp, sizeof(xfs_mount_t));
+}
+
+
+/*
+ * Check the validity of the SB found.
+ */
+STATIC int
+xfs_mount_validate_sb(
+	xfs_mount_t	*mp,
+	xfs_sb_t	*sbp)
+{
+	/*
+	 * If the log device and data device have the
+	 * same device number, the log is internal.
+	 * Consequently, the sb_logstart should be non-zero.  If
+	 * we have a zero sb_logstart in this case, we may be trying to mount
+	 * a volume filesystem in a non-volume manner.
+	 */
+	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+		cmn_err(CE_WARN, "XFS: bad magic number");
+		return XFS_ERROR(EWRONGFS);
+	}
+
+	if (!XFS_SB_GOOD_VERSION(sbp)) {
+		cmn_err(CE_WARN, "XFS: bad version");
+		return XFS_ERROR(EWRONGFS);
+	}
+
+	if (sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp) {
+		cmn_err(CE_WARN, "XFS: filesystem is marked as having an external log; specify logdev on the\nmount command line.");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	if (sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp) {
+		cmn_err(CE_WARN, "XFS: filesystem is marked as having an internal log; don't specify logdev on\nthe mount command line.");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/*
+	 * More sanity checking. These were stolen directly from
+	 * xfs_repair.
+	 */
+	if (sbp->sb_blocksize <= 0					||
+	    sbp->sb_agcount <= 0					||
+	    sbp->sb_sectsize <= 0					||
+	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
+	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
+	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
+	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
+	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
+	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
+	    sbp->sb_imax_pct > 100) {
+		cmn_err(CE_WARN, "XFS: SB sanity check 1 failed");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/*
+	 * sanity check ag count, size fields against data size field
+	 */
+	if (sbp->sb_dblocks == 0 ||
+	    sbp->sb_dblocks >
+	     (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
+	    sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
+			      sbp->sb_agblocks + XFS_MIN_AG_BLOCKS) {
+		cmn_err(CE_WARN, "XFS: SB sanity check 2 failed");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+#if !XFS_BIG_FILESYSTEMS
+	if (sbp->sb_dblocks > INT_MAX || sbp->sb_rblocks > INT_MAX)  {
+		cmn_err(CE_WARN,
+"XFS:  File systems greater than 1TB not supported on this system.");
+		return XFS_ERROR(E2BIG);
+	}
+#endif
+
+	if (sbp->sb_inprogress) {
+		cmn_err(CE_WARN, "XFS: file system busy");
+		return XFS_ERROR(EFSCORRUPTED);
+	}
+
+	/*
+	 * Until this is fixed only page-sized or smaller data blocks work.
+	 */
+	if (sbp->sb_blocksize > PAGE_SIZE) {
+		cmn_err(CE_WARN,
+		"XFS: Trying to mount file system with blocksize %d bytes",
+			sbp->sb_blocksize);
+		cmn_err(CE_WARN,
+		"XFS: Only page-sized (%d bytes) or less blocksizes currently work.",
+			PAGE_SIZE);
+		return XFS_ERROR(EWRONGFS);
+	}
+	return (0);
+}
+
+void
+xfs_initialize_perag(xfs_mount_t *mp, int agcount)
+{
+	int		index, max_metadata;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
+	xfs_ino_t	ino;
+	xfs_sb_t	*sbp = &mp->m_sb;
+	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;
+
+	/* Check to see if the filesystem can overflow 32 bit inodes */
+	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
+	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
+
+	/* Clear the mount flag if no inode can overflow 32 bits
+	 * on this filesystem.
+	 */
+	if (ino <= max_inum) {
+		mp->m_flags &= ~XFS_MOUNT_32BITINODES;
+	}
+
+	/* If we can overflow then setup the ag headers accordingly */
+	if (mp->m_flags & XFS_MOUNT_32BITINODES) {
+		/* Calculate how much should be reserved for inodes to
+		 * meet the max inode percentage.
+		 */
+		if (mp->m_maxicount) {
+			__uint64_t	icount;
+
+			icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+			do_div(icount, 100);
+			icount += sbp->sb_agblocks - 1;
+			do_div(icount, mp->m_ialloc_blks);
+			max_metadata = icount;
+		} else {
+			max_metadata = agcount;
+		}
+		for (index = 0; index < agcount; index++) {
+			ino = XFS_AGINO_TO_INO(mp, index, agino);
+			if (ino > max_inum) {
+				index++;
+				break;
+			}
+
+			/* This ag is prefered for inodes */
+			pag = &mp->m_perag[index];
+			pag->pagi_inodeok = 1;
+			if (index < max_metadata)
+				pag->pagf_metadata = 1;
+		}
+	} else {
+		/* Setup default behavior for smaller filesystems */
+		for (index = 0; index < agcount; index++) {
+			pag = &mp->m_perag[index];
+			pag->pagi_inodeok = 1;
+		}
+	}
+	mp->m_maxagi = index;
+}
+
+/*
+ * xfs_xlatesb
+ *
+ *     data	  - on disk version of sb
+ *     sb	  - a superblock
+ *     dir	  - conversion direction: <0 - convert sb to buf
+ *					  >0 - convert buf to sb
+ *     arch	  - architecture to read/write from/to buf
+ *     fields	  - which fields to copy (bitmask)
+ */
+void
+xfs_xlatesb(
+	void		*data,
+	xfs_sb_t	*sb,
+	int		dir,
+	xfs_arch_t	arch,
+	__int64_t	fields)
+{
+	xfs_caddr_t	buf_ptr;
+	xfs_caddr_t	mem_ptr;
+	xfs_sb_field_t	f;
+	int		first;
+	int		size;
+
+	ASSERT(dir);
+	ASSERT(fields);
+
+	if (!fields)
+		return;
+
+	buf_ptr = (xfs_caddr_t)data;
+	mem_ptr = (xfs_caddr_t)sb;
+
+	while (fields) {
+		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+		first = xfs_sb_info[f].offset;
+		size = xfs_sb_info[f + 1].offset - first;
+
+		ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
+
+		if (arch == ARCH_NOCONVERT ||
+		    size == 1 ||
+		    xfs_sb_info[f].type == 1) {
+			if (dir > 0) {
+				bcopy(buf_ptr + first, mem_ptr + first, size);
+			} else {
+				bcopy(mem_ptr + first, buf_ptr + first, size);
+			}
+		} else {
+			switch (size) {
+			case 2:
+				INT_XLATE(*(__uint16_t*)(buf_ptr+first),
+					  *(__uint16_t*)(mem_ptr+first),
+					  dir, arch);
+				break;
+			case 4:
+				INT_XLATE(*(__uint32_t*)(buf_ptr+first),
+					  *(__uint32_t*)(mem_ptr+first),
+					  dir, arch);
+				break;
+			case 8:
+				INT_XLATE(*(__uint64_t*)(buf_ptr+first),
+					  *(__uint64_t*)(mem_ptr+first), dir, arch);
+				break;
+			default:
+				ASSERT(0);
+			}
+		}
+
+		fields &= ~(1LL << f);
+	}
+}
+
+/*
+ * xfs_readsb
+ *
+ * Does the initial read of the superblock.
+ */
+int
+xfs_readsb(xfs_mount_t *mp)
+{
+	xfs_buf_t	*bp;
+	xfs_sb_t	*sbp;
+	int		error = 0;
+
+	ASSERT(mp->m_sb_bp == 0);
+
+	/*
+	 * Allocate a (locked) buffer to hold the superblock.
+	 * This will be kept around at all time to optimize
+	 * access to the superblock.
+	 */
+	bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, 1,
+		PBF_LOCK|PBF_READ|PBF_MAPPED|PBF_MAPPABLE|PBF_FS_MANAGED);
+	ASSERT(bp != NULL);
+	ASSERT(XFS_BUF_ISBUSY(bp) && XFS_BUF_VALUSEMA(bp) <= 0);
+
+	/*
+	 * Initialize the mount structure from the superblock.
+	 * But first do some basic consistency checking.
+	 */
+	sbp = XFS_BUF_TO_SBP(bp);
+	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, ARCH_CONVERT, XFS_SB_ALL_BITS);
+	if ((error = xfs_mount_validate_sb(mp, &(mp->m_sb)))) {
+		cmn_err(CE_WARN, "XFS: SB validate failed");
+		goto err;
+	}
+
+	mp->m_sb_bp = bp;
+	xfs_buf_relse(bp);
+	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
+	return 0;
+
+ err:
+	bp->pb_flags &= ~PBF_FS_MANAGED;
+	xfs_buf_relse(bp);
+	return error;
+}
+
+
+/*
+ * xfs_mount_common
+ *
+ * Mount initialization code establishing various mount
+ * fields from the superblock associated with the given
+ * mount structure
+ */
+void
+xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
+{
+	int	i;
+
+	mp->m_agfrotor = mp->m_agirotor = 0;
+	mp->m_maxagi = mp->m_sb.sb_agcount;
+	mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
+	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
+	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
+	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+	mp->m_litino = sbp->sb_inodesize -
+		((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
+	mp->m_blockmask = sbp->sb_blocksize - 1;
+	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
+	mp->m_blockwmask = mp->m_blockwsize - 1;
+
+
+	if (XFS_SB_VERSION_HASLOGV2(sbp)) {
+		if (sbp->sb_logsunit <= 1) {
+			mp->m_lstripemask = 1;
+		} else {
+			mp->m_lstripemask =
+				1 << xfs_highbit32(sbp->sb_logsunit >> BBSHIFT);
+		}
+	}
+
+	/*
+	 * Setup for attributes, in case they get created.
+	 * This value is for inodes getting attributes for the first time,
+	 * the per-inode value is for old attribute values.
+	 */
+	ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048);
+	switch (sbp->sb_inodesize) {
+	case 256:
+		mp->m_attroffset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(2);
+		break;
+	case 512:
+	case 1024:
+	case 2048:
+		mp->m_attroffset = XFS_BMDR_SPACE_CALC(12);
+		break;
+	default:
+		ASSERT(0);
+	}
+	ASSERT(mp->m_attroffset < XFS_LITINO(mp));
+
+	for (i = 0; i < 2; i++) {
+		mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
+			xfs_alloc, i == 0);
+		mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
+			xfs_alloc, i == 0);
+	}
+	for (i = 0; i < 2; i++) {
+		mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
+			xfs_bmbt, i == 0);
+		mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
+			xfs_bmbt, i == 0);
+	}
+	for (i = 0; i < 2; i++) {
+		mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
+			xfs_inobt, i == 0);
+		mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
+			xfs_inobt, i == 0);
+	}
+
+	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
+	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
+					sbp->sb_inopblock);
+	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
+}
+
+extern void xfs_refcache_sbdirty(struct super_block*);
+
+/*
+ * xfs_mountfs
+ *
+ * This function does the following on an initial mount of a file system:
+ *	- reads the superblock from disk and init the mount struct
+ *	- if we're a 32-bit kernel, do a size check on the superblock
+ *		so we don't mount terabyte filesystems
+ *	- init mount struct realtime fields
+ *	- allocate inode hash table for fs
+ *	- init directory manager
+ *	- perform recovery and init the log manager
+ */
+int
+xfs_mountfs(
+	vfs_t		*vfsp,
+	xfs_mount_t	*mp,
+	dev_t 		dev, 
+	int		mfsi_flags)
+{
+	xfs_buf_t	*bp;
+	xfs_sb_t	*sbp = &(mp->m_sb);
+	int		error = 0;
+	xfs_inode_t	*rip;
+	vnode_t		*rvp = 0;
+	int		readio_log;
+	int		writeio_log;
+	vmap_t		vmap;
+	xfs_daddr_t	d;
+	extern xfs_ioops_t xfs_iocore_xfs;	/* from xfs_iocore.c */
+	__uint64_t	ret64;
+	uint		quotaflags, quotaondisk, rootqcheck, needquotacheck;
+	boolean_t	needquotamount;
+	__int64_t	update_flags;
+	int		agno, noio;
+	int		uuid_mounted = 0;
+
+	noio = dev == 0 && mp->m_sb_bp != NULL;
+	if (mp->m_sb_bp == NULL) {
+		if ((error = xfs_readsb(mp))) {
+			return (error);
+		}
+	}
+	xfs_mount_common(mp, sbp);
+
+	/*
+	 * Check if sb_agblocks is aligned at stripe boundary
+	 * If sb_agblocks is NOT aligned turn off m_dalign since
+	 * allocator alignment is within an ag, therefore ag has
+	 * to be aligned at stripe boundary.
+	 */
+	update_flags = 0LL;
+	if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
+		/*
+		 * If stripe unit and stripe width are not multiples
+		 * of the fs blocksize turn off alignment.
+		 */
+		if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+		    (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+			if (mp->m_flags & XFS_MOUNT_RETERR) {
+				cmn_err(CE_WARN, "XFS: alignment check 1 failed");
+				error = XFS_ERROR(EINVAL);
+				goto error1;
+			}
+			mp->m_dalign = mp->m_swidth = 0;
+		} else {
+			/*
+			 * Convert the stripe unit and width to FSBs.
+			 */
+			mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+			if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
+				if (mp->m_flags & XFS_MOUNT_RETERR) {
+					error = XFS_ERROR(EINVAL);
+					goto error1;
+				}
+				mp->m_dalign = 0;
+				mp->m_swidth = 0;
+			} else if (mp->m_dalign) {
+				mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
+			} else {
+				if (mp->m_flags & XFS_MOUNT_RETERR) {
+					cmn_err(CE_WARN, "XFS: alignment check 3 failed");
+					error = XFS_ERROR(EINVAL);
+					goto error1;
+				}
+				mp->m_swidth = 0;
+			}
+		}
+
+		/*
+		 * Update superblock with new values
+		 * and log changes
+		 */
+		if (XFS_SB_VERSION_HASDALIGN(sbp)) {
+			if (sbp->sb_unit != mp->m_dalign) {
+				sbp->sb_unit = mp->m_dalign;
+				update_flags |= XFS_SB_UNIT;
+			}
+			if (sbp->sb_width != mp->m_swidth) {
+				sbp->sb_width = mp->m_swidth;
+				update_flags |= XFS_SB_WIDTH;
+			}
+		}
+	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
+		    XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) {
+			mp->m_dalign = sbp->sb_unit;
+			mp->m_swidth = sbp->sb_width;
+	}
+
+	xfs_alloc_compute_maxlevels(mp);
+	xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
+	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
+	xfs_ialloc_compute_maxlevels(mp);
+
+	if (sbp->sb_imax_pct) {
+		__uint64_t	icount;
+
+		/* Make sure the maximum inode count is a multiple of the
+		 * units we allocate inodes in.
+		 */
+
+		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+		do_div(icount, 100);
+		do_div(icount, mp->m_ialloc_blks);
+		mp->m_maxicount = (icount * mp->m_ialloc_blks)	<<
+				   sbp->sb_inopblog;
+	} else
+		mp->m_maxicount = 0;
+
+	/*
+	 * XFS uses the uuid from the superblock as the unique
+	 * identifier for fsid.	 We can not use the uuid from the volume
+	 * since a single partition filesystem is identical to a single
+	 * partition volume/filesystem.
+	 */
+	if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
+		if (xfs_uuid_mount(mp)) {
+			error = XFS_ERROR(EINVAL);
+			goto error1;
+		}
+		uuid_mounted=1;
+		ret64 = uuid_hash64(&sbp->sb_uuid);
+		bcopy(&ret64, &vfsp->vfs_fsid, sizeof(ret64));
+	}
+
+	/*
+	 * Set the default minimum read and write sizes unless
+	 * already specified in a mount option.
+	 * We use smaller I/O sizes when the file system
+	 * is being used for NFS service (wsync mount option).
+	 */
+	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+		if (mp->m_flags & XFS_MOUNT_WSYNC) {
+			readio_log = XFS_WSYNC_READIO_LOG;
+			writeio_log = XFS_WSYNC_WRITEIO_LOG;
+		} else {
+			readio_log = XFS_READIO_LOG_LARGE;
+			writeio_log = XFS_WRITEIO_LOG_LARGE;
+		}
+	} else {
+		readio_log = mp->m_readio_log;
+		writeio_log = mp->m_writeio_log;
+	}
+
+	/*
+	 * Set the number of readahead buffers to use based on
+	 * physical memory size.
+	 */
+	if (xfs_physmem <= 4096)		/* <= 16MB */
+		mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB;
+	else if (xfs_physmem <= 8192)	/* <= 32MB */
+		mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB;
+	else
+		mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32;
+	if (sbp->sb_blocklog > readio_log) {
+		mp->m_readio_log = sbp->sb_blocklog;
+	} else {
+		mp->m_readio_log = readio_log;
+	}
+	mp->m_readio_blocks = 1 << (mp->m_readio_log - sbp->sb_blocklog);
+	if (sbp->sb_blocklog > writeio_log) {
+		mp->m_writeio_log = sbp->sb_blocklog;
+	} else {
+		mp->m_writeio_log = writeio_log;
+	}
+	mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
+
+	/*
+	 * Set the inode cluster size based on the physical memory
+	 * size.  This may still be overridden by the file system
+	 * block size if it is larger than the chosen cluster size.
+	 */
+	if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */
+		mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE;
+	} else {
+		mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
+	}
+	/*
+	 * Set whether we're using inode alignment.
+	 */
+	if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) &&
+	    mp->m_sb.sb_inoalignmt >=
+	    XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
+		mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
+	else
+		mp->m_inoalign_mask = 0;
+	/*
+	 * If we are using stripe alignment, check whether
+	 * the stripe unit is a multiple of the inode alignment
+	 */
+	if (mp->m_dalign && mp->m_inoalign_mask &&
+	    !(mp->m_dalign & mp->m_inoalign_mask))
+		mp->m_sinoalign = mp->m_dalign;
+	else
+		mp->m_sinoalign = 0;
+	/*
+	 * Check that the data (and log if separate) are an ok size.
+	 */
+	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
+		cmn_err(CE_WARN, "XFS: size check 1 failed");
+		error = XFS_ERROR(E2BIG);
+		goto error1;
+	}
+	if (!noio) {
+		error = xfs_read_buf(mp, mp->m_ddev_targp, d - 1, 1, 0, &bp);
+		if (!error) {
+			xfs_buf_relse(bp);
+		} else {
+			cmn_err(CE_WARN, "XFS: size check 2 failed");
+			if (error == ENOSPC) {
+				error = XFS_ERROR(E2BIG);
+			}
+			goto error1;
+		}
+	}
+
+	if (!noio && ((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
+	    mp->m_logdev_targp != mp->m_ddev_targp) {
+		d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
+		if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
+			cmn_err(CE_WARN, "XFS: size check 3 failed");
+			error = XFS_ERROR(E2BIG);
+			goto error1;
+		}
+		error = xfs_read_buf(mp, mp->m_logdev_targp, d - 1, 1, 0, &bp);
+		if (!error) {
+			xfs_buf_relse(bp);
+		} else {
+			cmn_err(CE_WARN, "XFS: size check 3 failed");
+			if (error == ENOSPC) {
+				error = XFS_ERROR(E2BIG);
+			}
+			goto error1;
+		}
+	}
+
+	/*
+	 * Disallow mount attempts with (IRIX) project quota enabled
+	 */
+	if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
+	    (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT)) {
+		cmn_err(CE_WARN, "XFS: IRIX project quota are enabled");
+		error = XFS_ERROR(ENOSYS);
+		goto error1;
+	}
+
+	/*
+	 * Initialize realtime fields in the mount structure
+	 */
+	if ((error = xfs_rtmount_init(mp))) {
+		cmn_err(CE_WARN, "XFS: RT mount failed");
+		goto error1;
+	}
+
+	/*
+	 * For client case we are done now
+	 */
+	if (mfsi_flags & XFS_MFSI_CLIENT) {
+		return(0);
+	}
+
+	/*
+	 * Set up timer list structure for nfs refcache
+	 */
+	init_timer(&mp->m_sbdirty_timer);
+	mp->m_sbdirty_timer.function = (void (*)(unsigned long)) xfs_refcache_sbdirty;
+
+	/* Initialize the I/O function vector with XFS functions */
+	mp->m_io_ops = xfs_iocore_xfs;
+
+	/*
+	 *  Copies the low order bits of the timestamp and the randomly
+	 *  set "sequence" number out of a UUID.
+	 */
+	uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid);
+
+	/*
+	 *  The vfs structure needs to have a file system independent
+	 *  way of checking for the invariant file system ID.  Since it
+	 *  can't look at mount structures it has a pointer to the data
+	 *  in the mount structure.
+	 *
+	 *  File systems that don't support user level file handles (i.e.
+	 *  all of them except for XFS) will leave vfs_altfsid as NULL.
+	 */
+	vfsp->vfs_altfsid = (fsid_t *)mp->m_fixedfsid;
+	mp->m_dmevmask = 0;	/* not persistent; set after each mount */
+
+	/*
+	 * Select the right directory manager.
+	 */
+	mp->m_dirops =
+		XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ?
+			xfsv2_dirops :
+			xfsv1_dirops;
+
+	/*
+	 * Initialize directory manager's entries.
+	 */
+	XFS_DIR_MOUNT(mp);
+
+	/*
+	 * Initialize the attribute manager's entries.
+	 */
+	mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
+
+	/*
+	 * Initialize the precomputed transaction reservations values.
+	 */
+	xfs_trans_init(mp);
+	if (noio) {
+		ASSERT((mfsi_flags & XFS_MFSI_CLIENT) == 0);
+		return 0;
+	}
+
+	/*
+	 * Allocate and initialize the inode hash table for this
+	 * file system.
+	 */
+	xfs_ihash_init(mp);
+	xfs_chash_init(mp);
+
+	/*
+	 * Allocate and initialize the per-ag data.
+	 */
+	init_rwsem(&mp->m_peraglock);
+	mp->m_perag =
+		kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);
+
+	xfs_initialize_perag(mp, sbp->sb_agcount);
+
+	/*
+	 * log's mount-time initialization. Perform 1st part recovery if needed
+	 */
+	if (sbp->sb_logblocks > 0) {		/* check for volume case */
+		error = xfs_log_mount(mp, mp->m_logdev_targp->pbr_dev,
+				      XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
+				      XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
+		if (error) {
+			cmn_err(CE_WARN, "XFS: log mount failed");
+			goto error2;
+		}
+	} else {	/* No log has been defined */
+		cmn_err(CE_WARN, "XFS: no log defined");
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto error2;
+	}
+
+	/*
+	 * Get and sanity-check the root inode.
+	 * Save the pointer to it in the mount structure.
+	 */
+	error = xfs_iget(mp, NULL, sbp->sb_rootino, XFS_ILOCK_EXCL, &rip, 0);
+	if (error) {
+		cmn_err(CE_WARN, "XFS: failed to read root inode");
+		goto error3;
+	}
+
+	ASSERT(rip != NULL);
+	rvp = XFS_ITOV(rip);
+	if ((rip->i_d.di_mode & IFMT) != IFDIR) {
+		cmn_err(CE_WARN, "XFS: corrupted root inode");
+		VMAP(rvp, rip, vmap);
+		prdev("Root inode %llu is not a directory",
+		      mp->m_dev, (unsigned long long)rip->i_ino);
+		rvp->v_flag |= VPURGE;
+		xfs_iunlock(rip, XFS_ILOCK_EXCL);
+		VN_RELE(rvp);
+		vn_purge(rvp, &vmap);
+		error = XFS_ERROR(EFSCORRUPTED);
+		goto error3;
+	}
+	mp->m_rootip = rip;	/* save it */
+
+	xfs_iunlock(rip, XFS_ILOCK_EXCL);
+
+	quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
+		mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT);
+
+	/*
+	 * If the device itself is read-only, we can't allow
+	 * the user to change the state of quota on the mount -
+	 * this would generate a transaction on the ro device,
+	 * which would lead to an I/O error and shutdown
+	 */
+
+	if (((quotaondisk && !XFS_IS_QUOTA_ON(mp)) ||
+	      (!quotaondisk && XFS_IS_QUOTA_ON(mp))) &&
+	    (bdev_read_only(mp->m_ddev_targp->pbr_bdev) ||
+	     bdev_read_only(mp->m_logdev_targp->pbr_bdev))) {
+		cmn_err(CE_WARN,
+			"XFS: device %s is read-only, cannot change "
+			"quota state.  Please mount with%s quota option.",
+			mp->m_fsname, quotaondisk ? "" : "out");
+		rvp->v_flag |= VPURGE;
+		VN_RELE(rvp);
+		vn_remove(rvp);
+		error = XFS_ERROR(EPERM);
+		goto error3;
+	}
+
+	/*
+	 * Initialize realtime inode pointers in the mount structure
+	 */
+	if ((error = xfs_rtmount_inodes(mp))) {
+		/*
+		 * Free up the root inode.
+		 */
+		cmn_err(CE_WARN, "XFS: failed to read RT inodes");
+		rvp->v_flag |= VPURGE;
+		VMAP(rvp, rip, vmap);
+		VN_RELE(rvp);
+		vn_purge(rvp, &vmap);
+		goto error3;
+	}
+
+	/*
+	 * If fs is not mounted readonly, then update the superblock
+	 * unit and width changes.
+	 */
+	if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY))
+		xfs_mount_log_sbunit(mp, update_flags);
+
+	quotaflags = 0;
+	needquotamount = B_FALSE;
+
+	/*
+	 * Figure out if we'll need to do a quotacheck.
+	 * The requirements are a little different depending on whether
+	 * this fs is root or not.
+	 */
+	rootqcheck = (mp->m_dev == rootdev && quotaondisk && 
+		      ((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT &&
+			(mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) ||
+		       (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT &&
+			(mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0)));
+	needquotacheck = rootqcheck ||	XFS_QM_NEED_QUOTACHECK(mp);
+	if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
+		/*
+		 * Call mount_quotas at this point only if we won't have to do
+		 * a quotacheck.
+		 */
+		if (quotaondisk && !needquotacheck) {
+			/*
+			 * If the xfs quota code isn't installed,
+			 * we have to reset the quotachk'd bit.
+			 * If an error occured, qm_mount_quotas code
+			 * has already disabled quotas. So, just finish
+			 * mounting, and get on with the boring life
+			 * without disk quotas.
+			 */
+			if (xfs_qm_mount_quotas(mp))
+				xfs_mount_reset_sbqflags(mp);
+		} else {
+			/*
+			 * Clear the quota flags, but remember them. This
+			 * is so that the quota code doesn't get invoked
+			 * before we're ready. This can happen when an
+			 * inode goes inactive and wants to free blocks,
+			 * or via xfs_log_mount_finish.
+			 */
+			quotaflags = mp->m_qflags;
+			mp->m_qflags = 0;
+			needquotamount = B_TRUE;
+		}
+	}
+
+	/*
+	 * Finish recovering the file system.  This part needed to be
+	 * delayed until after the root and real-time bitmap inodes
+	 * were consistently read in.
+	 */
+	error = xfs_log_mount_finish(mp, mfsi_flags);
+	if (error) {
+		cmn_err(CE_WARN, "XFS: log mount finish failed");
+		goto error3;
+	}
+
+	if (needquotamount) {
+		ASSERT(mp->m_qflags == 0);
+		mp->m_qflags = quotaflags;
+		rootqcheck = ((XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) &&
+				mp->m_dev == rootdev && needquotacheck);
+		if (rootqcheck && (error = xfs_quotacheck_read_only(mp)))
+			goto error2;
+		if (xfs_qm_mount_quotas(mp))
+			xfs_mount_reset_sbqflags(mp);
+		if (rootqcheck)
+			XFS_MTOVFS(mp)->vfs_flag |= VFS_RDONLY;
+	}
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	if (! (XFS_IS_QUOTA_ON(mp)))
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
+	else
+		xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
+#endif
+
+#ifdef QUOTADEBUG
+	if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
+		cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
+#endif
+
+	return (0);
+
+ error3:
+	xfs_log_unmount_dealloc(mp);
+ error2:
+	xfs_ihash_free(mp);
+	xfs_chash_free(mp);
+	for (agno = 0; agno < sbp->sb_agcount; agno++)
+		if (mp->m_perag[agno].pagb_list)
+			kmem_free(mp->m_perag[agno].pagb_list,
+			  sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
+	kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t));
+	mp->m_perag = NULL;
+	/* FALLTHROUGH */
+ error1:
+	if (uuid_mounted)
+		xfs_uuid_unmount(mp);
+	xfs_freesb(mp);
+	return error;
+}
+
+/*
+ * xfs_unmountfs
+ *
+ * This flushes out the inodes,dquots and the superblock, unmounts the
+ * log and makes sure that incore structures are freed.
+ */
+int
+xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
+{
+	int		ndquots;
+#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
+	int64_t		fsid;
+#endif
+
+	xfs_iflush_all(mp, XFS_FLUSH_ALL);
+
+	/*
+	 * Purge the dquot cache.
+	 * None of the dquots should really be busy at this point.
+	 */
+	if (mp->m_quotainfo) {
+		while ((ndquots = xfs_qm_dqpurge_all(mp,
+						  XFS_QMOPT_UQUOTA|
+						  XFS_QMOPT_GQUOTA|
+						  XFS_QMOPT_UMOUNTING))) {
+			delay(ndquots * 10);
+		}
+	}
+
+	/*
+	 * Flush out the log synchronously so that we know for sure
+	 * that nothing is pinned.  This is important because bflush()
+	 * will skip pinned buffers.
+	 */
+	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+
+	xfs_binval(mp->m_ddev_targp);
+	if (mp->m_rtdev_targp) {
+		xfs_binval(mp->m_rtdev_targp);
+	}
+
+	xfs_unmountfs_writesb(mp);
+
+	xfs_log_unmount(mp);			/* Done! No more fs ops. */
+
+	xfs_freesb(mp);
+
+	/*
+	 * All inodes from this mount point should be freed.
+	 */
+	ASSERT(mp->m_inodes == NULL);
+
+	/*
+	 * We may have bufs that are in the process of getting written still.
+	 * We must wait for the I/O completion of those. The sync flag here
+	 * does a two pass iteration thru the bufcache.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		xfs_incore_relse(mp->m_ddev_targp, 0, 1); /* synchronous */
+	}
+
+	xfs_unmountfs_close(mp, cr);
+	if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
+		xfs_uuid_unmount(mp);
+
+#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
+	/*
+	 * clear all error tags on this filesystem
+	 */
+	bcopy(&(XFS_MTOVFS(mp)->vfs_fsid), &fsid, sizeof(int64_t));
+	(void) xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0);
+#endif
+
+	xfs_mount_free(mp, 1);
+	return 0;
+}
+
+void
+xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr)
+{
+	int		have_logdev = (mp->m_logdev_targp != mp->m_ddev_targp);
+
+	if (mp->m_ddev_targp) {
+		pagebuf_lock_disable(mp->m_ddev_targp, 0);
+		mp->m_ddev_targp = NULL;
+	}
+	if (mp->m_rtdev_targp) {
+		pagebuf_lock_disable(mp->m_rtdev_targp, 1);
+		mp->m_rtdev_targp = NULL;
+	}
+	if (mp->m_logdev_targp && have_logdev) {
+		pagebuf_lock_disable(mp->m_logdev_targp, 1);
+		mp->m_logdev_targp = NULL;
+	}
+}
+
+int
+xfs_unmountfs_writesb(xfs_mount_t *mp)
+{
+	xfs_buf_t	*sbp;
+	xfs_sb_t	*sb;
+	int		error = 0;
+
+	/*
+	 * skip superblock write if fs is read-only, or
+	 * if we are doing a forced umount.
+	 */
+	sbp = xfs_getsb(mp, 0);
+	if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
+		XFS_FORCED_SHUTDOWN(mp))) {
+		/*
+		 * mark shared-readonly if desired
+		 */
+		sb = XFS_BUF_TO_SBP(sbp);
+		if (mp->m_mk_sharedro) {
+			if (!(sb->sb_flags & XFS_SBF_READONLY))
+				sb->sb_flags |= XFS_SBF_READONLY;
+			if (!XFS_SB_VERSION_HASSHARED(sb))
+				XFS_SB_VERSION_ADDSHARED(sb);
+			xfs_fs_cmn_err(CE_NOTE, mp,
+				"Unmounting, marking shared read-only");
+		}
+		XFS_BUF_UNDONE(sbp);
+		XFS_BUF_UNREAD(sbp);
+		XFS_BUF_UNDELAYWRITE(sbp);
+		XFS_BUF_WRITE(sbp);
+		XFS_BUF_UNASYNC(sbp);
+		ASSERT(XFS_BUF_TARGET_DEV(sbp) == mp->m_dev);
+		xfsbdstrat(mp, sbp);
+		/* Nevermind errors we might get here. */
+		error = xfs_iowait(sbp);
+		if (error)
+			xfs_ioerror_alert("xfs_unmountfs_writesb",
+					  mp, sbp, XFS_BUF_ADDR(sbp));
+		if (error && mp->m_mk_sharedro)
+			xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting.	 Filesystem may not be marked shared readonly");
+	}
+	xfs_buf_relse(sbp);
+	return (error);
+}
+
+/*
+ * xfs_mod_sb() can be used to copy arbitrary changes to the
+ * in-core superblock into the superblock buffer to be logged.
+ * It does not provide the higher level of locking that is
+ * needed to protect the in-core superblock from concurrent
+ * access.
+ */
+void
+xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+{
+	xfs_buf_t		*bp;
+	int		first;
+	int		last;
+	xfs_mount_t	*mp;
+	xfs_sb_t	*sbp;
+	xfs_sb_field_t	f;
+
+	ASSERT(fields);
+	if (!fields)
+		return;
+	mp = tp->t_mountp;
+	bp = xfs_trans_getsb(tp, mp, 0);
+	sbp = XFS_BUF_TO_SBP(bp);
+	first = sizeof(xfs_sb_t);
+	last = 0;
+
+	/* translate/copy */
+
+	xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, ARCH_CONVERT, fields);
+
+	/* find modified range */
+
+	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
+	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+	first = xfs_sb_info[f].offset;
+
+	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+	last = xfs_sb_info[f + 1].offset - 1;
+
+	xfs_trans_log_buf(tp, bp, first, last);
+}
+
+/*
+ * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
+ * a delta to a specified field in the in-core superblock.  Simply
+ * switch on the field indicated and apply the delta to that field.
+ * Fields are not allowed to dip below zero, so if the delta would
+ * do this do not apply it and return EINVAL.
+ *
+ * The SB_LOCK must be held when this routine is called.
+ */
+STATIC int
+xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
+			  int delta, int rsvd)
+{
+	int		scounter;	/* short counter for 32 bit fields */
+	long long	lcounter;	/* long counter for 64 bit fields */
+	long long res_used, rem;
+
+	/*
+	 * With the in-core superblock spin lock held, switch
+	 * on the indicated field.  Apply the delta to the
+	 * proper field.  If the fields value would dip below
+	 * 0, then do not apply the delta and return EINVAL.
+	 */
+	switch (field) {
+	case XFS_SBS_ICOUNT:
+		lcounter = (long long)mp->m_sb.sb_icount;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_icount = lcounter;
+		return (0);
+	case XFS_SBS_IFREE:
+		lcounter = (long long)mp->m_sb.sb_ifree;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_ifree = lcounter;
+		return (0);
+	case XFS_SBS_FDBLOCKS:
+
+		lcounter = (long long)mp->m_sb.sb_fdblocks;
+		res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
+
+		if (delta > 0) {		/* Putting blocks back */
+			if (res_used > delta) {
+				mp->m_resblks_avail += delta;
+			} else {
+				rem = delta - res_used;
+				mp->m_resblks_avail = mp->m_resblks;
+				lcounter += rem;
+			}
+		} else {				/* Taking blocks away */
+
+			lcounter += delta;
+
+		/*
+		 * If were out of blocks, use any available reserved blocks if
+		 * were allowed to.
+		 */
+
+			if (lcounter < 0) {
+				if (rsvd) {
+					lcounter = (long long)mp->m_resblks_avail + delta;
+					if (lcounter < 0) {
+						return (XFS_ERROR(ENOSPC));
+					}
+					mp->m_resblks_avail = lcounter;
+					return (0);
+				} else {	/* not reserved */
+					return (XFS_ERROR(ENOSPC));
+				}
+			}
+		}
+
+		mp->m_sb.sb_fdblocks = lcounter;
+		return (0);
+	case XFS_SBS_FREXTENTS:
+		lcounter = (long long)mp->m_sb.sb_frextents;
+		lcounter += delta;
+		if (lcounter < 0) {
+			return (XFS_ERROR(ENOSPC));
+		}
+		mp->m_sb.sb_frextents = lcounter;
+		return (0);
+	case XFS_SBS_DBLOCKS:
+		lcounter = (long long)mp->m_sb.sb_dblocks;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_dblocks = lcounter;
+		return (0);
+	case XFS_SBS_AGCOUNT:
+		scounter = mp->m_sb.sb_agcount;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_agcount = scounter;
+		return (0);
+	case XFS_SBS_IMAX_PCT:
+		scounter = mp->m_sb.sb_imax_pct;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_imax_pct = scounter;
+		return (0);
+	case XFS_SBS_REXTSIZE:
+		scounter = mp->m_sb.sb_rextsize;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rextsize = scounter;
+		return (0);
+	case XFS_SBS_RBMBLOCKS:
+		scounter = mp->m_sb.sb_rbmblocks;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rbmblocks = scounter;
+		return (0);
+	case XFS_SBS_RBLOCKS:
+		lcounter = (long long)mp->m_sb.sb_rblocks;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rblocks = lcounter;
+		return (0);
+	case XFS_SBS_REXTENTS:
+		lcounter = (long long)mp->m_sb.sb_rextents;
+		lcounter += delta;
+		if (lcounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rextents = lcounter;
+		return (0);
+	case XFS_SBS_REXTSLOG:
+		scounter = mp->m_sb.sb_rextslog;
+		scounter += delta;
+		if (scounter < 0) {
+			ASSERT(0);
+			return (XFS_ERROR(EINVAL));
+		}
+		mp->m_sb.sb_rextslog = scounter;
+		return (0);
+	default:
+		ASSERT(0);
+		return (XFS_ERROR(EINVAL));
+	}
+}
+
+/*
+ * xfs_mod_incore_sb() is used to change a field in the in-core
+ * superblock structure by the specified delta.	 This modification
+ * is protected by the SB_LOCK.	 Just use the xfs_mod_incore_sb_unlocked()
+ * routine to do the work.
+ */
+int
+xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd)
+{
+	unsigned long	s;
+	int	status;
+
+	s = XFS_SB_LOCK(mp);
+	status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+	XFS_SB_UNLOCK(mp, s);
+	return (status);
+}
+
+/*
+ * xfs_mod_incore_sb_batch() is used to change more than one field
+ * in the in-core superblock structure at a time.  This modification
+ * is protected by a lock internal to this module.  The fields and
+ * changes to those fields are specified in the array of xfs_mod_sb
+ * structures passed in.
+ *
+ * Either all of the specified deltas will be applied or none of
+ * them will.  If any modified field dips below 0, then all modifications
+ * will be backed out and EINVAL will be returned.
+ */
+int
+xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
+{
+	unsigned long		s;
+	int		status=0;
+	xfs_mod_sb_t	*msbp;
+
+	/*
+	 * Loop through the array of mod structures and apply each
+	 * individually.  If any fail, then back out all those
+	 * which have already been applied.  Do all of this within
+	 * the scope of the SB_LOCK so that all of the changes will
+	 * be atomic.
+	 */
+	s = XFS_SB_LOCK(mp);
+	msbp = &msb[0];
+	for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
+		/*
+		 * Apply the delta at index n.	If it fails, break
+		 * from the loop so we'll fall into the undo loop
+		 * below.
+		 */
+		status = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
+						    msbp->msb_delta, rsvd);
+		if (status != 0) {
+			break;
+		}
+	}
+
+	/*
+	 * If we didn't complete the loop above, then back out
+	 * any changes made to the superblock.	If you add code
+	 * between the loop above and here, make sure that you
+	 * preserve the value of status. Loop back until
+	 * we step below the beginning of the array.  Make sure
+	 * we don't touch anything back there.
+	 */
+	if (status != 0) {
+		msbp--;
+		while (msbp >= msb) {
+			status = xfs_mod_incore_sb_unlocked(mp,
+				    msbp->msb_field, -(msbp->msb_delta), rsvd);
+			ASSERT(status == 0);
+			msbp--;
+		}
+	}
+	XFS_SB_UNLOCK(mp, s);
+	return (status);
+}
+
+/*
+ * xfs_getsb() is called to obtain the buffer for the superblock.
+ * The buffer is returned locked and read in from disk.
+ * The buffer should be released with a call to xfs_brelse().
+ *
+ * If the flags parameter is BUF_TRYLOCK, then we'll only return
+ * the superblock buffer if it can be locked without sleeping.
+ * If it can't then we'll return NULL.
+ */
+xfs_buf_t *
+xfs_getsb(xfs_mount_t	*mp,
+	  int		flags)
+{
+	xfs_buf_t	*bp;
+	ASSERT(mp->m_sb_bp != NULL);
+	bp = mp->m_sb_bp;
+	if (flags & XFS_BUF_TRYLOCK) {
+		if (!XFS_BUF_CPSEMA(bp)) {
+			return NULL;
+		}
+	} else {
+		XFS_BUF_PSEMA(bp, PRIBIO);
+	}
+	XFS_BUF_HOLD(bp);
+	ASSERT(XFS_BUF_ISDONE(bp));
+	return (bp);
+}
+
+/*
+ * Used to free the superblock along various error paths.
+ */
+void
+xfs_freesb(
+	xfs_mount_t	*mp)
+{
+	xfs_buf_t	*bp;
+
+	/*
+	 * Use xfs_getsb() so that the buffer will be locked
+	 * when we call nfreerbuf().
+	 */
+	bp = xfs_getsb(mp, 0);
+	bp->pb_flags &= ~PBF_FS_MANAGED;
+	xfs_buf_relse(bp);
+	mp->m_sb_bp = NULL;
+}
+
+/*
+ * See if the uuid is unique among mounted xfs filesystems.
+ * Mount fails if UUID is nil or a FS with the same UUID is already
+ * mounted
+ */
+STATIC int
+xfs_uuid_mount(xfs_mount_t *mp)
+{
+	int	hole;
+	int	i;
+
+	if (uuid_is_nil(&mp->m_sb.sb_uuid)) {
+		cmn_err(CE_WARN, "XFS: Filesystem %s has nil UUID - can't mount",
+			mp->m_fsname);
+		return -1;
+	}
+
+	mutex_lock(&xfs_uuidtabmon, PVFS);
+	for (i = 0, hole = -1; i < xfs_uuidtab_size; i++) {
+		if (uuid_is_nil(&xfs_uuidtab[i])) {
+			hole = i;
+			continue;
+		}
+		if (uuid_equal(&mp->m_sb.sb_uuid, &xfs_uuidtab[i])) {
+			cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount",
+				mp->m_fsname);
+			mutex_unlock(&xfs_uuidtabmon);
+			return -1;
+		}
+	}
+	if (hole < 0) {
+		xfs_uuidtab = kmem_realloc(xfs_uuidtab,
+			(xfs_uuidtab_size + 1) * sizeof(*xfs_uuidtab),
+			xfs_uuidtab_size  * sizeof(*xfs_uuidtab),
+			KM_SLEEP);
+		hole = xfs_uuidtab_size++;
+	}
+	xfs_uuidtab[hole] = mp->m_sb.sb_uuid;
+	mutex_unlock(&xfs_uuidtabmon);
+
+	return 0;
+}
+
+/*
+ * Remove filesystem from the uuid table.
+ */
+STATIC void
+xfs_uuid_unmount(xfs_mount_t *mp)
+{
+	int	i;
+
+	mutex_lock(&xfs_uuidtabmon, PVFS);
+	for (i = 0; i < xfs_uuidtab_size; i++) {
+		if (uuid_is_nil(&xfs_uuidtab[i]))
+			continue;
+		if (!uuid_equal(&mp->m_sb.sb_uuid, &xfs_uuidtab[i]))
+			continue;
+		uuid_create_nil(&xfs_uuidtab[i]);
+		break;
+	}
+	ASSERT(i < xfs_uuidtab_size);
+	mutex_unlock(&xfs_uuidtabmon);
+}
+
+/*
+ * When xfsquotas isn't installed and the superblock had quotas, we need to
+ * clear the quotaflags from superblock.
+ */
+STATIC void
+xfs_mount_reset_sbqflags(
+	xfs_mount_t	*mp)
+{
+	xfs_trans_t	*tp;
+	unsigned long		s;
+
+	mp->m_qflags = 0;
+	/*
+	 * It is OK to look at sb_qflags here in mount path,
+	 * without SB_LOCK.
+	 */
+	if (mp->m_sb.sb_qflags == 0)
+		return;
+	s = XFS_SB_LOCK(mp);
+	mp->m_sb.sb_qflags = 0;
+	XFS_SB_UNLOCK(mp, s);
+
+	/*
+	 * if the fs is readonly, let the incore superblock run
+	 * with quotas off but don't flush the update out to disk
+	 */
+	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+		return;
+#ifdef QUOTADEBUG
+	xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
+#endif
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				      XFS_DEFAULT_LOG_COUNT)) {
+		xfs_trans_cancel(tp, 0);
+		return;
+	}
+	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+	(void)xfs_trans_commit(tp, 0, NULL);
+}
+
+/*
+ * Used to log changes to the superblock unit and width fields which could
+ * be altered by the mount options. Only the first superblock is updated.
+ */
+STATIC void
+xfs_mount_log_sbunit(
+	xfs_mount_t *mp,
+	__int64_t fields)
+{
+	xfs_trans_t *tp;
+
+	ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID));
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
+	if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+				XFS_DEFAULT_LOG_COUNT)) {
+		xfs_trans_cancel(tp, 0);
+		return;
+	}
+	xfs_mod_sb(tp, fields);
+	(void)xfs_trans_commit(tp, 0, NULL);
+}
+
+/* Functions to lock access out of the filesystem for forced
+ * shutdown or snapshot.
+ */
+
+void
+xfs_start_freeze(
+	xfs_mount_t	*mp,
+	int		level)
+{
+	unsigned long	s = mutex_spinlock(&mp->m_freeze_lock);
+
+	mp->m_frozen = level;
+	mutex_spinunlock(&mp->m_freeze_lock, s);
+
+	if (level == XFS_FREEZE_TRANS) {
+		while (atomic_read(&mp->m_active_trans) > 0)
+			delay(100);
+	}
+}
+
+void
+xfs_finish_freeze(
+	xfs_mount_t *mp)
+{
+	unsigned long	s = mutex_spinlock(&mp->m_freeze_lock);
+
+	if (mp->m_frozen) {
+		mp->m_frozen = 0;
+		sv_broadcast(&mp->m_wait_unfreeze);
+	}
+
+	mutex_spinunlock(&mp->m_freeze_lock, s);
+}
+
+void
+xfs_check_frozen(
+	xfs_mount_t *mp,
+	bhv_desc_t *bdp,
+	int	level)
+{
+	SPLDECL(s);
+
+	if (mp->m_frozen) {
+		s = mutex_spinlock(&mp->m_freeze_lock);
+
+		if (mp->m_frozen < level) {
+			mutex_spinunlock(&mp->m_freeze_lock, s);
+		} else {
+			sv_wait(&mp->m_wait_unfreeze, 0, &mp->m_freeze_lock, s);
+		}
+	}
+
+	if (level == XFS_FREEZE_TRANS)
+		atomic_inc(&mp->m_active_trans);
+}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
new file mode 100644
index 000000000000..51c86fea20c4
--- /dev/null
+++ b/fs/xfs/xfs_mount.h
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_MOUNT_H__
+#define __XFS_MOUNT_H__
+
+
+typedef struct xfs_trans_reservations {
+	uint	tr_write;	/* extent alloc trans */
+	uint	tr_itruncate;	/* truncate trans */
+	uint	tr_rename;	/* rename trans */
+	uint	tr_link;	/* link trans */
+	uint	tr_remove;	/* unlink trans */
+	uint	tr_symlink;	/* symlink trans */
+	uint	tr_create;	/* create trans */
+	uint	tr_mkdir;	/* mkdir trans */
+	uint	tr_ifree;	/* inode free trans */
+	uint	tr_ichange;	/* inode update trans */
+	uint	tr_growdata;	/* fs data section grow trans */
+	uint	tr_swrite;	/* sync write inode trans */
+	uint	tr_addafork;	/* cvt inode to attributed trans */
+	uint	tr_writeid;	/* write setuid/setgid file */
+	uint	tr_attrinval;	/* attr fork buffer invalidation */
+	uint	tr_attrset;	/* set/create an attribute */
+	uint	tr_attrrm;	/* remove an attribute */
+	uint	tr_clearagi;	/* clear bad agi unlinked ino bucket */
+	uint	tr_growrtalloc; /* grow realtime allocations */
+	uint	tr_growrtzero;	/* grow realtime zeroing */
+	uint	tr_growrtfree;	/* grow realtime freeing */
+} xfs_trans_reservations_t;
+
+
+#ifndef __KERNEL__
+/*
+ * Moved here from xfs_ag.h to avoid reordering header files
+ */
+#define XFS_DADDR_TO_AGNO(mp,d) \
+	((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
+#define XFS_DADDR_TO_AGBNO(mp,d) \
+	((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
+#else
+struct cred;
+struct vfs;
+struct vnode;
+struct xfs_mount_args;
+struct xfs_ihash;
+struct xfs_chash;
+struct xfs_inode;
+struct xfs_perag;
+struct xfs_quotainfo;
+struct xfs_iocore;
+struct xfs_dio;
+struct xfs_bmbt_irec;
+struct xfs_bmap_free;
+
+#define SPLDECL(s)		unsigned long s
+#define AIL_LOCK_T		lock_t
+#define AIL_LOCKINIT(x,y)	spinlock_init(x,y)
+#define AIL_LOCK_DESTROY(x)	spinlock_destroy(x)
+#define AIL_LOCK(mp,s)		s=mutex_spinlock(&(mp)->m_ail_lock)
+#define AIL_UNLOCK(mp,s)	mutex_spinunlock(&(mp)->m_ail_lock, s)
+
+
+/* Prototypes and functions for I/O core modularization, a vector
+ * of functions is used to indirect from xfs/cxfs independent code
+ * to the xfs/cxfs dependent code.
+ * The vector is placed in the mount structure so that we can
+ * minimize the number of memory indirections involved.
+ */
+
+typedef int		(*xfs_dio_write_t)(struct xfs_dio *);
+typedef int		(*xfs_dio_read_t)(struct xfs_dio *);
+typedef int		(*xfs_strat_write_t)(struct xfs_iocore *, struct xfs_buf *);
+typedef int		(*xfs_bmapi_t)(struct xfs_trans *, void *,
+				xfs_fileoff_t, xfs_filblks_t, int,
+				xfs_fsblock_t *, xfs_extlen_t,
+				struct xfs_bmbt_irec *, int *,
+				struct xfs_bmap_free *);
+typedef int		(*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *);
+typedef int		(*xfs_rsync_t)(void *, int, xfs_off_t, xfs_off_t);
+typedef uint		(*xfs_lck_map_shared_t)(void *);
+typedef void		(*xfs_lock_t)(void *, uint);
+typedef void		(*xfs_lock_demote_t)(void *, uint);
+typedef int		(*xfs_lock_nowait_t)(void *, uint);
+typedef void		(*xfs_unlk_t)(void *, unsigned int);
+typedef void		(*xfs_chgtime_t)(void *, int);
+typedef xfs_fsize_t	(*xfs_size_t)(void *);
+typedef xfs_fsize_t	(*xfs_setsize_t)(void *, xfs_off_t);
+typedef xfs_fsize_t	(*xfs_lastbyte_t)(void *);
+
+typedef struct xfs_ioops {
+	xfs_bmapi_t		xfs_bmapi_func;
+	xfs_bmap_eof_t		xfs_bmap_eof_func;
+	xfs_lock_t		xfs_ilock;
+	xfs_lock_demote_t	xfs_ilock_demote;
+	xfs_lock_nowait_t	xfs_ilock_nowait;
+	xfs_unlk_t		xfs_unlock;
+	xfs_chgtime_t		xfs_chgtime;
+	xfs_size_t		xfs_size_func;
+	xfs_lastbyte_t		xfs_lastbyte;
+} xfs_ioops_t;
+
+
+#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist)	\
+	(*(mp)->m_io_ops.xfs_bmapi_func) \
+		(trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist)
+
+#define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \
+	(*(mp)->m_io_ops.xfs_bmap_eof_func) \
+		((io)->io_obj, endoff, whichfork, eof)
+
+#define XFS_ILOCK(mp, io, mode) \
+	(*(mp)->m_io_ops.xfs_ilock)((io)->io_obj, mode)
+
+#define XFS_IUNLOCK(mp, io, mode) \
+	(*(mp)->m_io_ops.xfs_unlock)((io)->io_obj, mode)
+
+#define XFS_ILOCK_DEMOTE(mp, io, mode) \
+	(*(mp)->m_io_ops.xfs_ilock_demote)((io)->io_obj, mode)
+
+#define XFS_SIZE(mp, io) \
+	(*(mp)->m_io_ops.xfs_size_func)((io)->io_obj)
+
+#define XFS_LASTBYTE(mp, io) \
+	(*(mp)->m_io_ops.xfs_lastbyte)((io)->io_obj)
+
+
+typedef struct xfs_mount {
+	bhv_desc_t		m_bhv;		/* vfs xfs behavior */
+	xfs_tid_t		m_tid;		/* next unused tid for fs */
+	AIL_LOCK_T		m_ail_lock;	/* fs AIL mutex */
+	xfs_ail_entry_t		m_ail;		/* fs active log item list */
+	uint			m_ail_gen;	/* fs AIL generation count */
+	xfs_sb_t		m_sb;		/* copy of fs superblock */
+	lock_t			m_sb_lock;	/* sb counter mutex */
+	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
+	char			*m_fsname;	/* filesystem name */
+	int			m_fsname_len;	/* strlen of fs name */
+	int			m_bsize;	/* fs logical block size */
+	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
+	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
+	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
+	int			m_ihsize;	/* size of next field */
+	struct xfs_ihash	*m_ihash;	/* fs private inode hash table*/
+	struct xfs_inode	*m_inodes;	/* active inode list */
+	mutex_t			m_ilock;	/* inode list mutex */
+	uint			m_ireclaims;	/* count of calls to reclaim*/
+	uint			m_readio_log;	/* min read size log bytes */
+	uint			m_readio_blocks; /* min read size blocks */
+	uint			m_writeio_log;	/* min write size log bytes */
+	uint			m_writeio_blocks; /* min write size blocks */
+	void			*m_log;		/* log specific stuff */
+	int			m_logbufs;	/* number of log buffers */
+	int			m_logbsize;	/* size of each log buffer */
+	uint			m_rsumlevels;	/* rt summary levels */
+	uint			m_rsumsize;	/* size of rt summary, bytes */
+	struct xfs_inode	*m_rbmip;	/* pointer to bitmap inode */
+	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
+	struct xfs_inode	*m_rootip;	/* pointer to root directory */
+	struct xfs_quotainfo	*m_quotainfo;	/* disk quota information */
+	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
+	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
+	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
+#define m_dev		m_ddev_targp->pbr_dev
+	__uint8_t		m_dircook_elog; /* log d-cookie entry bits */
+	__uint8_t		m_blkbit_log;	/* blocklog + NBBY */
+	__uint8_t		m_blkbb_log;	/* blocklog - BBSHIFT */
+	__uint8_t		m_agno_log;	/* log #ag's */
+	__uint8_t		m_agino_log;	/* #bits for agino in inum */
+	__uint8_t		m_nreadaheads;	/* #readahead buffers */
+	__uint16_t		m_inode_cluster_size;/* min inode buf size */
+	uint			m_blockmask;	/* sb_blocksize-1 */
+	uint			m_blockwsize;	/* sb_blocksize in words */
+	uint			m_blockwmask;	/* blockwsize-1 */
+	uint			m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */
+	uint			m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */
+	uint			m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */
+	uint			m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */
+	uint			m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */
+	uint			m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */
+	uint			m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
+	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
+	uint			m_in_maxlevels; /* XFS_IN_MAXLEVELS */
+	struct xfs_perag	*m_perag;	/* per-ag accounting info */
+	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
+	sema_t			m_growlock;	/* growfs mutex */
+	int			m_fixedfsid[2]; /* unchanged for life of FS */
+	uint			m_dmevmask;	/* DMI events for this FS */
+	uint			m_flags;	/* global mount flags */
+	uint			m_attroffset;	/* inode attribute offset */
+	int			m_da_node_ents; /* how many entries in danode */
+	int			m_ialloc_inos;	/* inodes in inode allocation */
+	int			m_ialloc_blks;	/* blocks in inode allocation */
+	int			m_litino;	/* size of inode union area */
+	int			m_inoalign_mask;/* mask sb_inoalignmt if used */
+	uint			m_qflags;	/* quota status flags */
+	xfs_trans_reservations_t m_reservations;/* precomputed res values */
+	__uint64_t		m_maxicount;	/* maximum inode count */
+	__uint64_t		m_resblks;	/* total reserved blocks */
+	__uint64_t		m_resblks_avail;/* available reserved blocks */
+#if XFS_BIG_FILESYSTEMS
+	xfs_ino_t		m_inoadd;	/* add value for ino64_offset */
+#endif
+	int			m_dalign;	/* stripe unit */
+	int			m_swidth;	/* stripe width */
+	int			m_lstripemask;	/* log stripe mask */
+	int			m_sinoalign;	/* stripe unit inode alignmnt */
+	int			m_attr_magicpct;/* 37% of the blocksize */
+	int			m_dir_magicpct; /* 37% of the dir blocksize */
+	__uint8_t		m_mk_sharedro;	/* mark shared ro on unmount */
+	__uint8_t		m_inode_quiesce;/* call quiesce on new inodes.
+						   field governed by m_ilock */
+	__uint8_t		m_dirversion;	/* 1 or 2 */
+	xfs_dirops_t		m_dirops;	/* table of dir funcs */
+	int			m_dirblksize;	/* directory block sz--bytes */
+	int			m_dirblkfsbs;	/* directory block sz--fsbs */
+	xfs_dablk_t		m_dirdatablk;	/* blockno of dir data v2 */
+	xfs_dablk_t		m_dirleafblk;	/* blockno of dir non-data v2 */
+	xfs_dablk_t		m_dirfreeblk;	/* blockno of dirfreeindex v2 */
+	int			m_chsize;	/* size of next field */
+	struct xfs_chash	*m_chash;	/* fs private inode per-cluster
+						 * hash table */
+	struct xfs_ioops	m_io_ops;	/* vector of I/O ops */
+	struct xfs_expinfo	*m_expinfo;	/* info to export to other
+						   cells. */
+	uint64_t		m_shadow_pinmask;
+						/* which bits matter in rpc
+						   log item pin masks */
+	uint			m_cxfstype;	/* mounted shared, etc. */
+	lock_t			m_freeze_lock;	/* Lock for m_frozen */
+	uint			m_frozen;	/* FS frozen for shutdown or
+						 * snapshot */
+	sv_t			m_wait_unfreeze;/* waiting to unfreeze */
+	atomic_t		m_active_trans; /* number trans frozen */
+	struct timer_list	m_sbdirty_timer;/* superblock dirty timer
+						 * for nfs refcache */
+} xfs_mount_t;
+
+/*
+ * Flags for m_flags.
+ */
+#define XFS_MOUNT_WSYNC		0x00000001	/* for nfs - all metadata ops
+						   must be synchronous except
+						   for space allocations */
+#if XFS_BIG_FILESYSTEMS
+#define XFS_MOUNT_INO64		0x00000002
+#endif
+#define XFS_MOUNT_ROOTQCHECK	0x00000004
+			     /* 0x00000008	-- currently unused */
+#define XFS_MOUNT_FS_SHUTDOWN	0x00000010	/* atomic stop of all filesystem
+						   operations, typically for
+						   disk errors in metadata */
+#define XFS_MOUNT_NOATIME	0x00000020	/* don't modify inode access
+						   times on reads */
+#define XFS_MOUNT_RETERR	0x00000040	/* return alignment errors to
+						   user */
+#define XFS_MOUNT_NOALIGN	0x00000080	/* turn off stripe alignment
+						   allocations */
+			     /* 0x00000100	-- currently unused */
+#define XFS_MOUNT_REGISTERED	0x00000200	/* registered with cxfs master
+						   cell logic */
+#define XFS_MOUNT_NORECOVERY	0x00000400	/* no recovery - dirty fs */
+#define XFS_MOUNT_SHARED	0x00000800	/* shared mount */
+#define XFS_MOUNT_DFLT_IOSIZE	0x00001000	/* set default i/o size */
+#define XFS_MOUNT_OSYNCISOSYNC	0x00002000	/* o_sync is REALLY o_sync */
+						/* osyncisdsync is now default*/
+#define XFS_MOUNT_NOUUID	0x00004000	/* ignore uuid during mount */
+#define XFS_MOUNT_32BITINODES	0x00008000	/* do not create inodes above
+						 * 32 bits in size */
+#define XFS_MOUNT_IRIXSGID	0x00010000	/* Irix-style sgid inheritance */
+#define XFS_MOUNT_NOLOGFLUSH	0x00020000
+
+/*
+ * Flags for m_cxfstype
+ */
+#define XFS_CXFS_NOT		0x00000001	/* local mount */
+#define XFS_CXFS_SERVER		0x00000002	/* we're the CXFS server */
+#define XFS_CXFS_CLIENT		0x00000004	/* We're a CXFS client */
+#define XFS_CXFS_REC_ENABLED	0x00000008	/* recovery is enabled */
+
+#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
+
+/*
+ * Default minimum read and write sizes.
+ */
+#define XFS_READIO_LOG_LARGE	12
+#define XFS_WRITEIO_LOG_LARGE	12
+/*
+ * Default allocation size
+ */
+#define XFS_WRITE_IO_LOG	16
+
+/*
+ * Max and min values for UIO and mount-option defined I/O sizes;
+ * min value can't be less than a page.	 Currently unused.
+ */
+#define XFS_MAX_IO_LOG		16	/* 64K */
+#define XFS_MIN_IO_LOG		PAGE_SHIFT
+
+/*
+ * Synchronous read and write sizes.  This should be
+ * better for NFSv2 wsync filesystems.
+ */
+#define XFS_WSYNC_READIO_LOG	15	/* 32K */
+#define XFS_WSYNC_WRITEIO_LOG	14	/* 16K */
+
+#define xfs_force_shutdown(m,f)	VFS_FORCE_SHUTDOWN(XFS_MTOVFS(m),f)
+/*
+ * Flags sent to xfs_force_shutdown.
+ */
+#define XFS_METADATA_IO_ERROR	0x1
+#define XFS_LOG_IO_ERROR	0x2
+#define XFS_FORCE_UMOUNT	0x4
+#define XFS_CORRUPT_INCORE	0x8	/* corrupt in-memory data structures */
+#define XFS_SHUTDOWN_REMOTE_REQ 0x10	/* shutdown came from remote cell */
+
+/*
+ * xflags for xfs_syncsub
+ */
+#define XFS_XSYNC_RELOC		0x01
+
+/*
+ * Flags for xfs_mountfs
+ */
+#define XFS_MFSI_SECOND		0x01	/* Is a cxfs secondary mount -- skip */
+					/* stuff which should only be done */
+					/* once. */
+#define XFS_MFSI_CLIENT		0x02	/* Is a client -- skip lots of stuff */
+#define XFS_MFSI_NOUNLINK	0x08	/* Skip unlinked inode processing in */
+					/* log recovery */
+
+/*
+ * Macros for getting from mount to vfs and back.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_MTOVFS)
+struct vfs *xfs_mtovfs(xfs_mount_t *mp);
+#define XFS_MTOVFS(mp)		xfs_mtovfs(mp)
+#else
+#define XFS_MTOVFS(mp)		(bhvtovfs(&(mp)->m_bhv))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BHVTOM)
+xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp);
+#define XFS_BHVTOM(bdp) xfs_bhvtom(bdp)
+#else
+#define XFS_BHVTOM(bdp)		((xfs_mount_t *)BHV_PDATA(bdp))
+#endif
+
+
+/*
+ * Moved here from xfs_ag.h to avoid reordering header files
+ */
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGNO)
+xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d);
+#define XFS_DADDR_TO_AGNO(mp,d)		xfs_daddr_to_agno(mp,d)
+#else
+
+static inline xfs_agnumber_t XFS_DADDR_TO_AGNO(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	d = XFS_BB_TO_FSBT(mp, d);
+	do_div(d, mp->m_sb.sb_agblocks);
+	return (xfs_agnumber_t) d;
+}
+
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_AGBNO)
+xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d);
+#define XFS_DADDR_TO_AGBNO(mp,d)	xfs_daddr_to_agbno(mp,d)
+#else
+
+static inline xfs_agblock_t XFS_DADDR_TO_AGBNO(xfs_mount_t *mp, xfs_daddr_t d)
+{
+	d = XFS_BB_TO_FSBT(mp, d);
+	return (xfs_agblock_t) do_div(d, mp->m_sb.sb_agblocks);
+}
+
+#endif
+
+/*
+ * This structure is for use by the xfs_mod_incore_sb_batch() routine.
+ */
+typedef struct xfs_mod_sb {
+	xfs_sb_field_t	msb_field;	/* Field to modify, see below */
+	int		msb_delta;	/* change to make to the specified field */
+} xfs_mod_sb_t;
+
+#define XFS_MOUNT_ILOCK(mp)	mutex_lock(&((mp)->m_ilock), PINOD)
+#define XFS_MOUNT_IUNLOCK(mp)	mutex_unlock(&((mp)->m_ilock))
+#define XFS_SB_LOCK(mp)		mutex_spinlock(&(mp)->m_sb_lock)
+#define XFS_SB_UNLOCK(mp,s)	mutex_spinunlock(&(mp)->m_sb_lock,(s))
+
+void		xfs_mod_sb(xfs_trans_t *, __int64_t);
+xfs_mount_t	*xfs_mount_init(void);
+void		xfs_mount_free(xfs_mount_t *mp, int remove_bhv);
+int		xfs_mountfs(struct vfs *, xfs_mount_t *mp, dev_t, int);
+
+int		xfs_unmountfs(xfs_mount_t *, struct cred *);
+void		xfs_unmountfs_close(xfs_mount_t *, struct cred *);
+int		xfs_unmountfs_writesb(xfs_mount_t *);
+int		xfs_unmount_flush(xfs_mount_t *, int);
+int		xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int);
+int		xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int);
+int		xfs_readsb(xfs_mount_t *mp);
+struct xfs_buf	*xfs_getsb(xfs_mount_t *, int);
+void		xfs_freesb(xfs_mount_t *);
+void		xfs_do_force_shutdown(bhv_desc_t *, int, char *, int);
+int		xfs_syncsub(xfs_mount_t *, int, int, int *);
+void		xfs_initialize_perag(xfs_mount_t *, int);
+void		xfs_xlatesb(void *, struct xfs_sb *, int, xfs_arch_t, __int64_t);
+
+/*
+ * Flags for freeze operations.
+ */
+#define XFS_FREEZE_WRITE	1
+#define XFS_FREEZE_TRANS	2
+
+void		xfs_start_freeze(xfs_mount_t *, int);
+void		xfs_finish_freeze(xfs_mount_t *);
+void		xfs_check_frozen(xfs_mount_t *, bhv_desc_t *, int);
+
+extern	struct vfsops xfs_vfsops;
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
new file mode 100644
index 000000000000..db7f44f0eb52
--- /dev/null
+++ b/fs/xfs/xfs_qm.c
@@ -0,0 +1,2899 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+
+kmem_zone_t	*qm_dqzone;
+kmem_zone_t	*qm_dqtrxzone;
+
+STATIC void	xfs_qm_list_init(xfs_dqlist_t *, char *, int);
+STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
+STATIC int	xfs_qm_quotacheck(xfs_mount_t *);
+
+STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
+STATIC void	xfs_qm_shake(void);
+
+#ifdef DEBUG
+extern mutex_t	qcheck_lock;
+#endif
+
+#ifdef QUOTADEBUG
+#define XQM_LIST_PRINT(l, NXT, title) \
+{ \
+	  xfs_dquot_t	*dqp; int i = 0;\
+	  printk("%s (#%d)\n", title, (int) (l)->qh_nelems); \
+	  for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \
+	    printk("\t%d.\t\"%d (%s)\"\t bcnt = %d, icnt = %d refs = %d\n", \
+			 ++i, (int) INT_GET(dqp->q_core.d_id, ARCH_CONVERT), \
+			 DQFLAGTO_TYPESTR(dqp),	     \
+			 (int) INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), \
+			 (int) INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), \
+			 (int) dqp->q_nrefs);  } \
+}
+#endif
+
+/*
+ * Initialize the XQM structure.
+ * Note that there is not one quota manager per file system.
+ */
+struct xfs_qm *
+xfs_qm_init(void)
+{
+	xfs_qm_t		*xqm;
+	int			hsize, i;
+
+	xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
+	ASSERT(xqm);
+
+	/*
+	 * Initialize the dquot hash tables.
+	 */
+	hsize = (DQUOT_HASH_HEURISTIC < XFS_QM_NCSIZE_THRESHOLD) ?
+		XFS_QM_HASHSIZE_LOW : XFS_QM_HASHSIZE_HIGH;
+	xqm->qm_dqhashmask = hsize - 1;
+
+	/*
+	 * XXXsup We could keep reference counts on usr and grp quotas
+	 * inside XQM separately, and avoid having two hashtables even
+	 * when only one 'type' is active in the system.
+	 */
+	xqm->qm_usr_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize *
+						      sizeof(xfs_dqhash_t),
+						      KM_SLEEP);
+	xqm->qm_grp_dqhtable = (xfs_dqhash_t *)kmem_zalloc(hsize *
+						      sizeof(xfs_dqhash_t),
+						      KM_SLEEP);
+	ASSERT(xqm->qm_usr_dqhtable != NULL);
+	ASSERT(xqm->qm_grp_dqhtable != NULL);
+
+	for (i = 0; i < hsize; i++) {
+		xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
+		xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
+	}
+
+	/*
+	 * Freelist of all dquots of all file systems
+	 */
+	xfs_qm_freelist_init(&(xqm->qm_dqfreelist));
+
+	/*
+	 * dquot zone. we register our own low-memory callback.
+	 */
+	if (!qm_dqzone) {
+		xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
+						"xfs_dquots");
+		qm_dqzone = xqm->qm_dqzone;
+	} else
+		xqm->qm_dqzone = qm_dqzone;
+
+	kmem_shake_register(xfs_qm_shake);
+
+	/*
+	 * The t_dqinfo portion of transactions.
+	 */
+	if (!qm_dqtrxzone) {
+		xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
+						   "xfs_dqtrx");
+		qm_dqtrxzone = xqm->qm_dqtrxzone;
+	} else
+		xqm->qm_dqtrxzone = qm_dqtrxzone;
+
+	atomic_set(&xqm->qm_totaldquots, 0);
+	xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
+	xqm->qm_nrefs = 0;
+#ifdef DEBUG
+	mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk");
+#endif
+	return (xqm);
+}
+
+/*
+ * Destroy the global quota manager when its reference count goes to zero.
+ */
+void
+xfs_qm_destroy(
+	struct xfs_qm *xqm)
+{
+	int	hsize, i;
+
+	ASSERT(xqm != NULL);
+	ASSERT(xqm->qm_nrefs == 0);
+	kmem_shake_deregister(xfs_qm_shake);
+	hsize = xqm->qm_dqhashmask + 1;
+	for (i = 0; i < hsize; i++) {
+		xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
+		xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
+	}
+	kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t));
+	kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t));
+	xqm->qm_usr_dqhtable = NULL;
+	xqm->qm_grp_dqhtable = NULL;
+	xqm->qm_dqhashmask = 0;
+	xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist));
+#ifdef DEBUG
+	mutex_destroy(&qcheck_lock);
+#endif
+	kmem_free(xqm, sizeof(xfs_qm_t));
+}
+
+/*
+ * Called at mount time to let XQM know that another file system is
+ * starting quotas. This isn't crucial information as the individual mount
+ * structures are pretty independent, but it helps the XQM keep a
+ * global view of what's going on.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_hold_quotafs_ref(
+	struct xfs_mount *mp)
+{
+	/*
+	 * Need to lock the xfs_Gqm structure for things like this. For example,
+	 * the structure could disappear between the entry to this routine and
+	 * a HOLD operation if not locked.
+	 */
+	XFS_QM_LOCK(xfs_Gqm);
+
+	if (xfs_Gqm == NULL) {
+		if ((xfs_Gqm = xfs_qm_init()) == NULL) {
+			return (XFS_ERROR(EINVAL));
+		}
+	}
+	/*
+	 * We can keep a list of all filesystems with quotas mounted for
+	 * debugging and statistical purposes, but ...
+	 * Just take a reference and get out.
+	 */
+	XFS_QM_HOLD(xfs_Gqm);
+	XFS_QM_UNLOCK(xfs_Gqm);
+
+	return 0;
+}
+
+
+/*
+ * Release the reference that a filesystem took at mount time,
+ * so that we know when we need to destroy the entire quota manager.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_rele_quotafs_ref(
+	struct xfs_mount *mp)
+{
+	xfs_dquot_t	*dqp, *nextdqp;
+
+	ASSERT(xfs_Gqm);
+	ASSERT(xfs_Gqm->qm_nrefs > 0);
+
+	/*
+	 * Go thru the freelist and destroy all inactive dquots.
+	 */
+	xfs_qm_freelist_lock(xfs_Gqm);
+
+	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
+	     dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) {
+		xfs_dqlock(dqp);
+		nextdqp = dqp->dq_flnext;
+		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+			ASSERT(dqp->q_mount == NULL);
+			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+			ASSERT(dqp->HL_PREVP == NULL);
+			ASSERT(dqp->MPL_PREVP == NULL);
+			XQM_FREELIST_REMOVE(dqp);
+			xfs_dqunlock(dqp);
+			xfs_qm_dqdestroy(dqp);
+		} else {
+			xfs_dqunlock(dqp);
+		}
+		dqp = nextdqp;
+	}
+	xfs_qm_freelist_unlock(xfs_Gqm);
+
+	/*
+	 * Destroy the entire XQM. If somebody mounts with quotaon, this'll
+	 * be restarted.
+	 */
+	XFS_QM_LOCK(xfs_Gqm);
+	XFS_QM_RELE(xfs_Gqm);
+	if (xfs_Gqm->qm_nrefs == 0) {
+		xfs_qm_destroy(xfs_Gqm);
+		xfs_Gqm = NULL;
+	}
+	XFS_QM_UNLOCK(xfs_Gqm);
+}
+
+/*
+ * This is called at mount time from xfs_mountfs to initialize the quotainfo
+ * structure and start the global quotamanager (xfs_Gqm) if it hasn't done
+ * so already.	Note that the superblock has not been read in yet.
+ */
+void
+xfs_qm_mount_quotainit(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	/*
+	 * User or group quotas has to be on.
+	 */
+	ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA));
+
+	/*
+	 * Initialize the flags in the mount structure. From this point
+	 * onwards we look at m_qflags to figure out if quotas's ON/OFF, etc.
+	 * Note that we enforce nothing if accounting is off.
+	 * ie.	XFSMNT_*QUOTA must be ON for XFSMNT_*QUOTAENF.
+	 * It isn't necessary to take the quotaoff lock to do this; this is
+	 * called from mount.
+	 */
+	if (flags & XFSMNT_UQUOTA) {
+		mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+		if (flags & XFSMNT_UQUOTAENF)
+			mp->m_qflags |= XFS_UQUOTA_ENFD;
+	}
+	if (flags & XFSMNT_GQUOTA) {
+		mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+		if (flags & XFSMNT_GQUOTAENF)
+			mp->m_qflags |= XFS_GQUOTA_ENFD;
+	}
+}
+
+/*
+ * Just destroy the quotainfo structure.
+ */
+void
+xfs_qm_unmount_quotadestroy(
+	xfs_mount_t	*mp)
+{
+	xfs_qm_destroy_quotainfo(mp);
+}
+
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo, and in the rootfs's case
+ * xfs_Gqm. This is also responsible for running a quotacheck as necessary.
+ * We are guaranteed that the superblock is consistently read in at this
+ * point.
+ */
+int
+xfs_qm_mount_quotas(
+	xfs_mount_t	*mp)
+{
+	unsigned long		s;
+	int		error;
+	uint		sbf;
+
+	error = 0;
+	/*
+	 * If a non-root file system had quotas running earlier, but decided
+	 * to mount without -o quota/pquota options, revoke the quotachecked
+	 * license, and bail out.
+	 */
+	if (! XFS_IS_QUOTA_ON(mp) &&
+	    (mp->m_dev != rootdev) &&
+	    (mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT))) {
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+
+	/*
+	 * If quotas on realtime volumes is not supported, we disable
+	 * quotas immediately.
+	 */
+	if (mp->m_sb.sb_rextents) {
+		cmn_err(CE_NOTE,
+			"Cannot turn on quotas for realtime filesystem %s",
+			mp->m_fsname);
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+	cmn_err(CE_NOTE, "Attempting to turn on disk quotas.");
+#endif
+	/*
+	 * If this is the root file system, mark flags in mount struct first.
+	 * We couldn't do this earlier because we didn't have the superblock
+	 * read in.
+	 */
+	if (mp->m_dev == rootdev) {
+		ASSERT(XFS_SB_VERSION_HASQUOTA(&mp->m_sb));
+		ASSERT(mp->m_sb.sb_qflags &
+		       (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT));
+		if (xfs_Gqm == NULL) {
+			if ((xfs_Gqm = xfs_qm_init()) == NULL) {
+				mp->m_qflags = 0;
+				error = EINVAL;
+				goto write_changes;
+			}
+		}
+		mp->m_qflags = mp->m_sb.sb_qflags;
+		if (mp->m_qflags & XFS_UQUOTA_ACCT)
+			mp->m_qflags |= XFS_UQUOTA_ACTIVE;
+		if (mp->m_qflags & XFS_GQUOTA_ACCT)
+			mp->m_qflags |= XFS_GQUOTA_ACTIVE;
+		/*
+		 * The quotainode of the root file system may or may not
+		 * exist at this point.
+		 */
+	}
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+	/*
+	 * Allocate the quotainfo structure inside the mount struct, and
+	 * create quotainode(s), and change/rev superblock if necessary.
+	 */
+	if ((error = xfs_qm_init_quotainfo(mp))) {
+		/*
+		 * We must turn off quotas.
+		 */
+		ASSERT(mp->m_quotainfo == NULL);
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+	/*
+	 * If any of the quotas are not consistent, do a quotacheck.
+	 */
+	if (XFS_QM_NEED_QUOTACHECK(mp)) {
+#ifdef DEBUG
+		cmn_err(CE_NOTE, "Doing a quotacheck. Please wait.");
+#endif
+		if ((error = xfs_qm_quotacheck(mp))) {
+			cmn_err(CE_WARN, "Quotacheck unsuccessful (Error %d): "
+				"Disabling quotas.",
+				error);
+			/*
+			 * We must turn off quotas.
+			 */
+			ASSERT(mp->m_quotainfo != NULL);
+			ASSERT(xfs_Gqm != NULL);
+			xfs_qm_destroy_quotainfo(mp);
+			mp->m_qflags = 0;
+			goto write_changes;
+		}
+#ifdef DEBUG
+		cmn_err(CE_NOTE, "Done quotacheck.");
+#endif
+	}
+ write_changes:
+	/*
+	 * We actually don't have to acquire the SB_LOCK at all.
+	 * This can only be called from mount, and that's single threaded. XXX
+	 */
+	s = XFS_SB_LOCK(mp);
+	sbf = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+	XFS_SB_UNLOCK(mp, s);
+
+	if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+		if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
+			/*
+			 * We could only have been turning quotas off.
+			 * We aren't in very good shape actually because
+			 * the incore structures are convinced that quotas are
+			 * off, but the on disk superblock doesn't know that !
+			 */
+			ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+			xfs_fs_cmn_err(CE_ALERT, mp,
+				"XFS mount_quotas: Superblock update failed!");
+		}
+	}
+
+	if (error) {
+		xfs_fs_cmn_err(CE_WARN, mp,
+			"Failed to initialize disk quotas.");
+	}
+	return XFS_ERROR(error);
+}
+
+/*
+ * Called from the vfsops layer.
+ */
+int
+xfs_qm_unmount_quotas(
+	xfs_mount_t	*mp)
+{
+	xfs_inode_t	*uqp, *gqp;
+	int		error;
+
+	error = 0;
+
+	/*
+	 * Release the dquots that root inode, et al might be holding,
+	 * before we flush quotas and blow away the quotainfo structure.
+	 */
+	ASSERT(mp->m_rootip);
+	if (mp->m_rootip->i_udquot || mp->m_rootip->i_gdquot)
+		xfs_qm_dqdettach_inode(mp->m_rootip);
+	if (mp->m_rbmip &&
+	    (mp->m_rbmip->i_udquot || mp->m_rbmip->i_gdquot))
+		xfs_qm_dqdettach_inode(mp->m_rbmip);
+	if (mp->m_rsumip &&
+	    (mp->m_rsumip->i_udquot || mp->m_rsumip->i_gdquot))
+		xfs_qm_dqdettach_inode(mp->m_rsumip);
+
+	/*
+	 * Flush out the quota inodes.
+	 */
+	uqp = gqp = NULL;
+	if (mp->m_quotainfo) {
+		if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) {
+			xfs_ilock(uqp, XFS_ILOCK_EXCL);
+			xfs_iflock(uqp);
+			error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
+			xfs_iunlock(uqp, XFS_ILOCK_EXCL);
+			if (error == EFSCORRUPTED)
+				goto out;
+		}
+		if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) {
+			xfs_ilock(gqp, XFS_ILOCK_EXCL);
+			xfs_iflock(gqp);
+			error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
+			xfs_iunlock(gqp, XFS_ILOCK_EXCL);
+			if (error == EFSCORRUPTED)
+				goto out;
+		}
+	}
+	if (uqp) {
+		 XFS_PURGE_INODE(uqp);
+		 mp->m_quotainfo->qi_uquotaip = NULL;
+	}
+	if (gqp) {
+		XFS_PURGE_INODE(gqp);
+		mp->m_quotainfo->qi_gquotaip = NULL;
+	}
+out:
+	return XFS_ERROR(error);
+}
+
+/*
+ * Flush all dquots of the given file system to disk. The dquots are
+ * _not_ purged from memory here, just their data written to disk.
+ */
+int
+xfs_qm_dqflush_all(
+	xfs_mount_t	*mp,
+	int		flags)
+{
+	int		recl;
+	xfs_dquot_t	*dqp;
+	int		niters;
+	int		error;
+
+	if (mp->m_quotainfo == NULL)
+		return (0);
+	niters = 0;
+again:
+	xfs_qm_mplist_lock(mp);
+	FOREACH_DQUOT_IN_MP(dqp, mp) {
+		xfs_dqlock(dqp);
+		if (! XFS_DQ_IS_DIRTY(dqp)) {
+			xfs_dqunlock(dqp);
+			continue;
+		}
+		xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
+		/* XXX a sentinel would be better */
+		recl = XFS_QI_MPLRECLAIMS(mp);
+		if (! xfs_qm_dqflock_nowait(dqp)) {
+			/*
+			 * If we can't grab the flush lock then check
+			 * to see if the dquot has been flushed delayed
+			 * write.  If so, grab its buffer and send it
+			 * out immediately.  We'll be able to acquire
+			 * the flush lock when the I/O completes.
+			 */
+			xfs_qm_dqflock_pushbuf_wait(dqp);
+		}
+		/*
+		 * Let go of the mplist lock. We don't want to hold it
+		 * across a disk write.
+		 */
+		xfs_qm_mplist_unlock(mp);
+		error = xfs_qm_dqflush(dqp, flags);
+		xfs_dqunlock(dqp);
+		if (error)
+			return (error);
+
+		/*
+		 * If this is the root filesystem doing a quotacheck,
+		 * we should do periodic bflushes. This is because there's
+		 * no bflushd at this point.
+		 */
+		if (mp->m_flags & XFS_MOUNT_ROOTQCHECK) {
+			if (++niters == XFS_QM_MAX_DQCLUSTER_LOGSZ) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE | XFS_LOG_SYNC);
+				XFS_bflush(mp->m_ddev_targp);
+				niters = 0;
+			}
+		}
+
+		xfs_qm_mplist_lock(mp);
+		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
+			xfs_qm_mplist_unlock(mp);
+			/* XXX restart limit */
+			goto again;
+		}
+	}
+
+	xfs_qm_mplist_unlock(mp);
+	/* return ! busy */
+	return (0);
+}
+/*
+ * Release the group dquot pointers the user dquots may be
+ * carrying around as a hint. mplist is locked on entry and exit.
+ */
+STATIC void
+xfs_qm_detach_gdquots(
+	xfs_mount_t	*mp)
+{
+	xfs_dquot_t	*dqp, *gdqp;
+	int		nrecl;
+
+ again:
+	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	dqp = XFS_QI_MPLNEXT(mp);
+	while (dqp) {
+		xfs_dqlock(dqp);
+		if ((gdqp = dqp->q_gdquot)) {
+			xfs_dqlock(gdqp);
+			dqp->q_gdquot = NULL;
+		}
+		xfs_dqunlock(dqp);
+
+		if (gdqp) {
+			/*
+			 * Can't hold the mplist lock across a dqput.
+			 * XXXmust convert to marker based iterations here.
+			 */
+			nrecl = XFS_QI_MPLRECLAIMS(mp);
+			xfs_qm_mplist_unlock(mp);
+			xfs_qm_dqput(gdqp);
+
+			xfs_qm_mplist_lock(mp);
+			if (nrecl != XFS_QI_MPLRECLAIMS(mp))
+				goto again;
+		}
+		dqp = dqp->MPL_NEXT;
+	}
+}
+
+/*
+ * Go through all the incore dquots of this file system and take them
+ * off the mplist and hashlist, if the dquot type matches the dqtype
+ * parameter. This is used when turning off quota accounting for
+ * users and/or groups, as well as when the filesystem is unmounting.
+ */
+int
+xfs_qm_dqpurge_all(
+		   xfs_mount_t	*mp,
+		   uint		flags) /* QUOTAOFF/UMOUNTING/UQUOTA/GQUOTA */
+{
+	xfs_dquot_t	*dqp;
+	uint		dqtype;
+	int		nrecl;
+	xfs_dquot_t	*nextdqp;
+	int		nmisses;
+
+	if (mp->m_quotainfo == NULL)
+		return (0);
+
+	dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
+	dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
+
+	xfs_qm_mplist_lock(mp);
+
+	/*
+	 * In the first pass through all incore dquots of this filesystem,
+	 * we release the group dquot pointers the user dquots may be
+	 * carrying around as a hint. We need to do this irrespective of
+	 * what's being turned off.
+	 */
+	xfs_qm_detach_gdquots(mp);
+
+      again:
+	nmisses = 0;
+	ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
+	/*
+	 * Try to get rid of all of the unwanted dquots. The idea is to
+	 * get them off mplist and hashlist, but leave them on freelist.
+	 */
+	dqp = XFS_QI_MPLNEXT(mp);
+	while (dqp) {
+		/*
+		 * It's OK to look at the type without taking dqlock here.
+		 * We're holding the mplist lock here, and that's needed for
+		 * a dqreclaim.
+		 */
+		if ((dqp->dq_flags & dqtype) == 0) {
+			dqp = dqp->MPL_NEXT;
+			continue;
+		}
+
+		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+			nrecl = XFS_QI_MPLRECLAIMS(mp);
+			xfs_qm_mplist_unlock(mp);
+			XFS_DQ_HASH_LOCK(dqp->q_hash);
+			xfs_qm_mplist_lock(mp);
+
+			/*
+			 * XXXTheoretically, we can get into a very long
+			 * ping pong game here.
+			 * No one can be adding dquots to the mplist at
+			 * this point, but somebody might be taking things off.
+			 */
+			if (nrecl != XFS_QI_MPLRECLAIMS(mp)) {
+				XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+				goto again;
+			}
+		}
+
+		/*
+		 * Take the dquot off the mplist and hashlist. It may remain on
+		 * freelist in INACTIVE state.
+		 */
+		nextdqp = dqp->MPL_NEXT;
+		nmisses += xfs_qm_dqpurge(dqp, flags);
+		dqp = nextdqp;
+	}
+	xfs_qm_mplist_unlock(mp);
+	return (nmisses);
+}
+
+STATIC int
+xfs_qm_dqattach_one(
+	xfs_inode_t	*ip,
+	xfs_dqid_t	id,
+	uint		type,
+	uint		doalloc,
+	uint		dolock,
+	xfs_dquot_t	*udqhint, /* hint */
+	xfs_dquot_t	**IO_idqpp)
+{
+	xfs_dquot_t	*dqp;
+	int		error;
+
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	error = 0;
+	/*
+	 * See if we already have it in the inode itself. IO_idqpp is
+	 * &i_udquot or &i_gdquot. This made the code look weird, but
+	 * made the logic a lot simpler.
+	 */
+	if ((dqp = *IO_idqpp)) {
+		if (dolock)
+			xfs_dqlock(dqp);
+		xfs_dqtrace_entry(dqp, "DQATTACH: found in ip");
+		goto done;
+	}
+
+	/*
+	 * udqhint is the i_udquot field in inode, and is non-NULL only
+	 * when the type arg is XFS_DQ_GROUP. Its purpose is to save a
+	 * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
+	 * the user dquot.
+	 */
+	ASSERT(!udqhint || type == XFS_DQ_GROUP);
+	if (udqhint && !dolock)
+		xfs_dqlock(udqhint);
+
+	/*
+	 * No need to take dqlock to look at the id.
+	 * The ID can't change until it gets reclaimed, and it won't
+	 * be reclaimed as long as we have a ref from inode and we hold
+	 * the ilock.
+	 */
+	if (udqhint &&
+	    (dqp = udqhint->q_gdquot) &&
+	    (INT_GET(dqp->q_core.d_id, ARCH_CONVERT) == id)) {
+		ASSERT(XFS_DQ_IS_LOCKED(udqhint));
+		xfs_dqlock(dqp);
+		XFS_DQHOLD(dqp);
+		ASSERT(*IO_idqpp == NULL);
+		*IO_idqpp = dqp;
+		if (!dolock) {
+			xfs_dqunlock(dqp);
+			xfs_dqunlock(udqhint);
+		}
+		/* XXX XFS_STATS */
+		goto done;
+	}
+	/*
+	 * We can't hold a dquot lock when we call the dqget code.
+	 * We'll deadlock in no time, because of (not conforming to)
+	 * lock ordering - the inodelock comes before any dquot lock,
+	 * and we may drop and reacquire the ilock in xfs_qm_dqget().
+	 */
+	if (udqhint)
+		xfs_dqunlock(udqhint);
+	/*
+	 * Find the dquot from somewhere. This bumps the
+	 * reference count of dquot and returns it locked.
+	 * This can return ENOENT if dquot didn't exist on
+	 * disk and we didn't ask it to allocate;
+	 * ESRCH if quotas got turned off suddenly.
+	 */
+	if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type,
+				 doalloc|XFS_QMOPT_DOWARN, &dqp))) {
+		if (udqhint && dolock)
+			xfs_dqlock(udqhint);
+		goto done;
+	}
+
+	xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget");
+	/*
+	 * dqget may have dropped and re-acquired the ilock, but it guarantees
+	 * that the dquot returned is the one that should go in the inode.
+	 */
+	*IO_idqpp = dqp;
+	ASSERT(dqp);
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	if (! dolock) {
+		xfs_dqunlock(dqp);
+		ASSERT(!udqhint || !XFS_DQ_IS_LOCKED(udqhint));
+		goto done;
+	}
+	if (! udqhint)
+		goto done;
+
+	ASSERT(udqhint);
+	ASSERT(dolock);
+	ASSERT(! XFS_DQ_IS_LOCKED(udqhint));
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	if (! xfs_qm_dqlock_nowait(udqhint)) {
+		xfs_dqunlock(dqp);
+		xfs_dqlock(udqhint);
+		xfs_dqlock(dqp);
+	}
+      done:
+#ifdef QUOTADEBUG
+	if (udqhint) {
+		if (dolock)
+			ASSERT(XFS_DQ_IS_LOCKED(udqhint));
+		else
+			ASSERT(! XFS_DQ_IS_LOCKED(udqhint));
+	}
+	if (! error) {
+		if (dolock)
+			ASSERT(XFS_DQ_IS_LOCKED(dqp));
+		else
+			ASSERT(! XFS_DQ_IS_LOCKED(dqp));
+	}
+#endif
+	return (error);
+}
+
+
+/*
+ * Given a udquot and gdquot, attach a ptr to the group dquot in the
+ * udquot as a hint for future lookups. The idea sounds simple, but the
+ * execution isn't, because the udquot might have a group dquot attached
+ * already and getting rid of that gets us into lock ordering contraints.
+ * The process is complicated more by the fact that the dquots may or may not
+ * be locked on entry.
+ */
+STATIC void
+xfs_qm_dqattach_grouphint(
+	xfs_dquot_t	*udq,
+	xfs_dquot_t	*gdq,
+	uint		locked)
+{
+	xfs_dquot_t	*tmp;
+
+#ifdef QUOTADEBUG
+	if (locked) {
+		ASSERT(XFS_DQ_IS_LOCKED(udq));
+		ASSERT(XFS_DQ_IS_LOCKED(gdq));
+	} else {
+		ASSERT(! XFS_DQ_IS_LOCKED(udq));
+		ASSERT(! XFS_DQ_IS_LOCKED(gdq));
+	}
+#endif
+	if (! locked)
+		xfs_dqlock(udq);
+
+	if ((tmp = udq->q_gdquot)) {
+		if (tmp == gdq) {
+			if (! locked)
+				xfs_dqunlock(udq);
+			return;
+		}
+
+		udq->q_gdquot = NULL;
+		/*
+		 * We can't keep any dqlocks when calling dqrele,
+		 * because the freelist lock comes before dqlocks.
+		 */
+		xfs_dqunlock(udq);
+		if (locked)
+			xfs_dqunlock(gdq);
+		/*
+		 * we took a hard reference once upon a time in dqget,
+		 * so give it back when the udquot no longer points at it
+		 * dqput() does the unlocking of the dquot.
+		 */
+		xfs_qm_dqrele(tmp);
+
+		ASSERT(! XFS_DQ_IS_LOCKED(udq));
+		ASSERT(! XFS_DQ_IS_LOCKED(gdq));
+		xfs_dqlock(udq);
+		xfs_dqlock(gdq);
+
+	} else {
+		ASSERT(XFS_DQ_IS_LOCKED(udq));
+		if (! locked) {
+			ASSERT(! XFS_DQ_IS_LOCKED(gdq));
+			xfs_dqlock(gdq);
+		}
+	}
+
+	ASSERT(XFS_DQ_IS_LOCKED(udq));
+	ASSERT(XFS_DQ_IS_LOCKED(gdq));
+	/*
+	 * Somebody could have attached a gdquot here,
+	 * when we dropped the uqlock. If so, just do nothing.
+	 */
+	if (udq->q_gdquot == NULL) {
+		XFS_DQHOLD(gdq);
+		udq->q_gdquot = gdq;
+	}
+	if (! locked) {
+		xfs_dqunlock(gdq);
+		xfs_dqunlock(udq);
+	}
+}
+
+
+/*
+ * Given a locked inode, attach dquot(s) to it, taking UQUOTAON / GQUOTAON
+ * in to account.
+ * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
+ * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty
+ * much made this code a complete mess, but it has been pretty useful.
+ * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
+ * Inode may get unlocked and relocked in here, and the caller must deal with
+ * the consequences.
+ */
+int
+xfs_qm_dqattach(
+		xfs_inode_t	*ip,
+		uint		flags)
+{
+	int		error;
+	xfs_mount_t	*mp;
+	uint		nquotas;
+
+	mp = ip->i_mount;
+	ASSERT(ip->i_ino != mp->m_sb.sb_uquotino &&
+	       ip->i_ino != mp->m_sb.sb_gquotino);
+
+	ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 ||
+	       XFS_ISLOCKED_INODE_EXCL(ip));
+
+	nquotas = 0;
+	error = 0;
+	if (! (flags & XFS_QMOPT_ILOCKED))
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		if ((error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
+						flags & XFS_QMOPT_DQALLOC,
+						flags & XFS_QMOPT_DQLOCK,
+						NULL, &ip->i_udquot)))
+			goto done;
+		nquotas++;
+	}
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		if ((error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+						flags & XFS_QMOPT_DQALLOC,
+						flags & XFS_QMOPT_DQLOCK,
+						ip->i_udquot, &ip->i_gdquot)))
+			/*
+			 * Don't worry about the udquot that we may have
+			 * attached above. It'll get dettached, if not already.
+			 */
+			goto done;
+		nquotas++;
+	}
+
+	/*
+	 * Attach this group quota to the user quota as a hint.
+	 * This WON'T, in general, result in a thrash.
+	 */
+	if (nquotas == 2) {
+		ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+		ASSERT(ip->i_udquot);
+		ASSERT(ip->i_gdquot);
+
+		/*
+		 * We may or may not have the i_udquot locked at this point,
+		 * but this check is OK since we don't depend on the i_gdquot to
+		 * be accurate 100% all the time. It is just a hint, and this
+		 * will succeed in general.
+		 */
+		if (ip->i_udquot->q_gdquot == ip->i_gdquot)
+			goto done;
+		/*
+		 * Attach i_gdquot to the gdquot hint inside the i_udquot.
+		 */
+		xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot,
+					 flags & XFS_QMOPT_DQLOCK);
+	}
+
+      done:
+
+#ifdef QUOTADEBUG
+	if (! error) {
+		if (ip->i_udquot) {
+			if (flags & XFS_QMOPT_DQLOCK)
+				ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot));
+			else
+				ASSERT(! XFS_DQ_IS_LOCKED(ip->i_udquot));
+		}
+		if (ip->i_gdquot) {
+			if (flags & XFS_QMOPT_DQLOCK)
+				ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot));
+			else
+				ASSERT(! XFS_DQ_IS_LOCKED(ip->i_gdquot));
+		}
+		if (XFS_IS_UQUOTA_ON(mp))
+			ASSERT(ip->i_udquot);
+		if (XFS_IS_GQUOTA_ON(mp))
+			ASSERT(ip->i_gdquot);
+	}
+#endif
+
+	if (! (flags & XFS_QMOPT_ILOCKED))
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+#ifdef QUOTADEBUG
+	else
+		ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+#endif
+	return (error);
+}
+
+/*
+ * Release dquots (and their references) if any.
+ * The inode should be locked EXCL except when this's called by
+ * xfs_ireclaim.
+ */
+void
+xfs_qm_dqdettach_inode(
+	xfs_inode_t	*ip)
+{
+	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
+	ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+	if (ip->i_udquot)
+		xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip);
+	if (ip->i_udquot) {
+		xfs_qm_dqrele(ip->i_udquot);
+		ip->i_udquot = NULL;
+	}
+	if (ip->i_gdquot) {
+		xfs_qm_dqrele(ip->i_gdquot);
+		ip->i_gdquot = NULL;
+	}
+}
+
+int
+xfs_qm_unmount(
+	xfs_mount_t	*mp)
+{
+	vnode_t		*vp;
+
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		vp = XFS_ITOV(XFS_QI_UQIP(mp));
+		VN_RELE(vp);
+		if (vn_count(vp) > 1)
+			cmn_err(CE_WARN, "UQUOTA busy vp=0x%x count=%d\n",
+				vp, vn_count(vp));
+	}
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		vp = XFS_ITOV(XFS_QI_GQIP(mp));
+		VN_RELE(vp);
+		if (vn_count(vp) > 1)
+			cmn_err(CE_WARN, "GQUOTA busy vp=0x%x count=%d\n",
+				vp, vn_count(vp));
+	}
+
+	return (0);
+}
+
+
+/*
+ * This is called by xfs_sync and flags arg determines the caller,
+ * and its motives, as done in xfs_sync.
+ *
+ * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
+ * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
+ * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
+ */
+
+int
+xfs_qm_sync(
+	xfs_mount_t	*mp,
+	short		flags)
+{
+	int		recl, restarts;
+	xfs_dquot_t	*dqp;
+	uint		flush_flags;
+	boolean_t	nowait;
+	int		error;
+
+	restarts = 0;
+	/*
+	 * We won't block unless we are asked to.
+	 */
+	nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
+
+  again:
+	xfs_qm_mplist_lock(mp);
+	/*
+	 * dqpurge_all() also takes the mplist lock and iterate thru all dquots
+	 * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
+	 * when we have the mplist lock, we know that dquots will be consistent
+	 * as long as we have it locked.
+	 */
+	if (! XFS_IS_QUOTA_ON(mp)) {
+		xfs_qm_mplist_unlock(mp);
+		return (0);
+	}
+	FOREACH_DQUOT_IN_MP(dqp, mp) {
+		/*
+		 * If this is vfs_sync calling, then skip the dquots that
+		 * don't 'seem' to be dirty. ie. don't acquire dqlock.
+		 * This is very similar to what xfs_sync does with inodes.
+		 */
+		if (flags & SYNC_BDFLUSH) {
+			if (! XFS_DQ_IS_DIRTY(dqp))
+				continue;
+		}
+
+		if (nowait) {
+			/*
+			 * Try to acquire the dquot lock. We are NOT out of
+			 * lock order, but we just don't want to wait for this
+			 * lock, unless somebody wanted us to.
+			 */
+			if (! xfs_qm_dqlock_nowait(dqp))
+				continue;
+		} else {
+			xfs_dqlock(dqp);
+		}
+
+		/*
+		 * Now, find out for sure if this dquot is dirty or not.
+		 */
+		if (! XFS_DQ_IS_DIRTY(dqp)) {
+			xfs_dqunlock(dqp);
+			continue;
+		}
+
+		/* XXX a sentinel would be better */
+		recl = XFS_QI_MPLRECLAIMS(mp);
+		if (! xfs_qm_dqflock_nowait(dqp)) {
+			if (nowait) {
+				xfs_dqunlock(dqp);
+				continue;
+			}
+			/*
+			 * If we can't grab the flush lock then if the caller
+			 * really wanted us to give this our best shot,
+			 * see if we can give a push to the buffer before we wait
+			 * on the flush lock. At this point, we know that
+			 * eventhough the dquot is being flushed,
+			 * it has (new) dirty data.
+			 */
+			xfs_qm_dqflock_pushbuf_wait(dqp);
+		}
+		/*
+		 * Let go of the mplist lock. We don't want to hold it
+		 * across a disk write
+		 */
+		flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
+		xfs_qm_mplist_unlock(mp);
+		xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
+		error = xfs_qm_dqflush(dqp, flush_flags);
+		xfs_dqunlock(dqp);
+		if (error && XFS_FORCED_SHUTDOWN(mp))
+			return(0);	/* Need to prevent umount failure */
+		else if (error)
+			return (error);
+
+		xfs_qm_mplist_lock(mp);
+		if (recl != XFS_QI_MPLRECLAIMS(mp)) {
+			if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
+				break;
+
+			xfs_qm_mplist_unlock(mp);
+			goto again;
+		}
+	}
+
+	xfs_qm_mplist_unlock(mp);
+	return (0);
+}
+
+
+/*
+ * This initializes all the quota information that's kept in the
+ * mount structure
+ */
+int
+xfs_qm_init_quotainfo(
+	xfs_mount_t	*mp)
+{
+	xfs_quotainfo_t *qinf;
+	int		error;
+	xfs_dquot_t	*dqp;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * Tell XQM that we exist as soon as possible.
+	 */
+	if ((error = xfs_qm_hold_quotafs_ref(mp))) {
+		return (error);
+	}
+
+	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+
+	/*
+	 * See if quotainodes are setup, and if not, allocate them,
+	 * and change the superblock accordingly.
+	 */
+	if ((error = xfs_qm_init_quotainos(mp))) {
+		kmem_free(qinf, sizeof(xfs_quotainfo_t));
+		mp->m_quotainfo = NULL;
+		return (error);
+	}
+
+	spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin");
+	xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
+	qinf->qi_dqreclaims = 0;
+
+	/* mutex used to serialize quotaoffs */
+	mutex_init(&qinf->qi_quotaofflock, MUTEX_DEFAULT, "qoff");
+
+	/* Precalc some constants */
+	qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+	ASSERT(qinf->qi_dqchunklen);
+	qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
+	do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
+
+	mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
+
+	/*
+	 * We try to get the limits from the superuser's limits fields.
+	 * This is quite hacky, but it is standard quota practice.
+	 * We look at the USR dquot with id == 0 first, but if user quotas
+	 * are not enabled we goto the GRP dquot with id == 0.
+	 * We don't really care to keep separate default limits for user
+	 * and group quotas, at least not at this point.
+	 */
+	error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
+			     (XFS_IS_UQUOTA_RUNNING(mp)) ?
+			     XFS_DQ_USER : XFS_DQ_GROUP,
+			     XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
+			     &dqp);
+	if (! error) {
+		/*
+		 * The warnings and timers set the grace period given to
+		 * a user or group before he or she can not perform any
+		 * more writing. If it is zero, a default is used.
+		 */
+		qinf->qi_btimelimit = INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT) : XFS_QM_BTIMELIMIT;
+		qinf->qi_itimelimit = INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT) : XFS_QM_ITIMELIMIT;
+		qinf->qi_rtbtimelimit = INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT) : XFS_QM_RTBTIMELIMIT;
+		qinf->qi_bwarnlimit = INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) : XFS_QM_BWARNLIMIT;
+		qinf->qi_iwarnlimit = INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) ?
+			INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) : XFS_QM_IWARNLIMIT;
+
+		/*
+		 * We sent the XFS_QMOPT_DQSUSER flag to dqget because
+		 * we don't want this dquot cached. We haven't done a
+		 * quotacheck yet, and quotacheck doesn't like incore dquots.
+		 */
+		xfs_qm_dqdestroy(dqp);
+	} else {
+		qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
+		qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
+		qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
+		qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
+		qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
+	}
+
+	return (0);
+}
+
+
+/*
+ * Gets called when unmounting a filesystem or when all quotas get
+ * turned off.
+ * This purges the quota inodes, destroys locks and frees itself.
+ */
+void
+xfs_qm_destroy_quotainfo(
+	xfs_mount_t	*mp)
+{
+	xfs_quotainfo_t *qi;
+
+	qi = mp->m_quotainfo;
+	ASSERT(qi != NULL);
+	ASSERT(xfs_Gqm != NULL);
+
+	/*
+	 * Release the reference that XQM kept, so that we know
+	 * when the XQM structure should be freed. We cannot assume
+	 * that xfs_Gqm is non-null after this point.
+	 */
+	xfs_qm_rele_quotafs_ref(mp);
+
+	spinlock_destroy(&qi->qi_pinlock);
+	xfs_qm_list_destroy(&qi->qi_dqlist);
+
+	if (qi->qi_uquotaip) {
+		XFS_PURGE_INODE(qi->qi_uquotaip);
+		qi->qi_uquotaip = NULL; /* paranoia */
+	}
+	if (qi->qi_gquotaip) {
+		XFS_PURGE_INODE(qi->qi_gquotaip);
+		qi->qi_gquotaip = NULL;
+	}
+	mutex_destroy(&qi->qi_quotaofflock);
+	kmem_free(qi, sizeof(xfs_quotainfo_t));
+	mp->m_quotainfo = NULL;
+}
+
+
+
+/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
+
+/* ARGSUSED */
+STATIC void
+xfs_qm_list_init(
+	xfs_dqlist_t	*list,
+	char		*str,
+	int		n)
+{
+	mutex_init(&list->qh_lock, MUTEX_DEFAULT, str);
+	list->qh_next = NULL;
+	list->qh_version = 0;
+	list->qh_nelems = 0;
+}
+
+STATIC void
+xfs_qm_list_destroy(
+	xfs_dqlist_t	*list)
+{
+	mutex_destroy(&(list->qh_lock));
+}
+
+
+/*
+ * Stripped down version of dqattach. This doesn't attach, or even look at the
+ * dquots attached to the inode. The rationale is that there won't be any
+ * attached at the time this is called from quotacheck.
+ */
+STATIC int
+xfs_qm_dqget_noattach(
+	xfs_inode_t	*ip,
+	xfs_dquot_t	**O_udqpp,
+	xfs_dquot_t	**O_gdqpp)
+{
+	int		error;
+	xfs_mount_t	*mp;
+	xfs_dquot_t	*udqp, *gdqp;
+
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	mp = ip->i_mount;
+	udqp = NULL;
+	gdqp = NULL;
+
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		ASSERT(ip->i_udquot == NULL);
+		/*
+		 * We want the dquot allocated if it doesn't exist.
+		 */
+		if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER,
+					 XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN,
+					 &udqp))) {
+			/*
+			 * Shouldn't be able to turn off quotas here.
+			 */
+			ASSERT(error != ESRCH);
+			ASSERT(error != ENOENT);
+			return (error);
+		}
+		ASSERT(udqp);
+	}
+
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		ASSERT(ip->i_gdquot == NULL);
+		if (udqp)
+			xfs_dqunlock(udqp);
+		if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+					 XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN,
+					 &gdqp))) {
+			if (udqp)
+				xfs_qm_dqrele(udqp);
+			ASSERT(error != ESRCH);
+			ASSERT(error != ENOENT);
+			return (error);
+		}
+		ASSERT(gdqp);
+
+		/* Reacquire the locks in the right order */
+		if (udqp) {
+			if (! xfs_qm_dqlock_nowait(udqp)) {
+				xfs_dqunlock(gdqp);
+				xfs_dqlock(udqp);
+				xfs_dqlock(gdqp);
+			}
+		}
+	}
+
+	*O_udqpp = udqp;
+	*O_gdqpp = gdqp;
+
+#ifdef QUOTADEBUG
+	if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp));
+	if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp));
+#endif
+	return (0);
+}
+
+/*
+ * Create an inode and return with a reference already taken, but unlocked
+ * This is how we create quota inodes
+ */
+STATIC int
+xfs_qm_qino_alloc(
+	xfs_mount_t	*mp,
+	xfs_inode_t	**ip,
+	__int64_t	sbfields,
+	uint		flags)
+{
+	xfs_trans_t	*tp;
+	int		error;
+	unsigned long s;
+	cred_t		zerocr;
+	int		committed;
+
+	tp = xfs_trans_alloc(mp,XFS_TRANS_QM_QINOCREATE);
+	if ((error = xfs_trans_reserve(tp,
+				      XFS_QM_QINOCREATE_SPACE_RES(mp),
+				      XFS_CREATE_LOG_RES(mp), 0,
+				      XFS_TRANS_PERM_LOG_RES,
+				      XFS_CREATE_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		return (error);
+	}
+	bzero(&zerocr, sizeof(zerocr));
+
+	if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, IFREG, 1, mp->m_dev,
+				   &zerocr, 0, 1, ip, &committed))) {
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+				 XFS_TRANS_ABORT);
+		return (error);
+	}
+
+	/*
+	 * Keep an extra reference to this quota inode. This inode is
+	 * locked exclusively and joined to the transaction already.
+	 */
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip));
+	VN_HOLD(XFS_ITOV((*ip)));
+
+	/*
+	 * Make the changes in the superblock, and log those too.
+	 * sbfields arg may contain fields other than *QUOTINO;
+	 * VERSIONNUM for example.
+	 */
+	s = XFS_SB_LOCK(mp);
+	if (flags & XFS_QMOPT_SBVERSION) {
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+		unsigned oldv = mp->m_sb.sb_versionnum;
+#endif
+		ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb));
+		ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+				   XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
+		       (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+			XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
+
+		XFS_SB_VERSION_ADDQUOTA(&mp->m_sb);
+		mp->m_sb.sb_uquotino = NULLFSINO;
+		mp->m_sb.sb_gquotino = NULLFSINO;
+
+		/* qflags will get updated _after_ quotacheck */
+		mp->m_sb.sb_qflags = 0;
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+		cmn_err(CE_NOTE,
+			"Old superblock version %x, converting to %x.",
+			oldv, mp->m_sb.sb_versionnum);
+#endif
+	}
+	if (flags & XFS_QMOPT_UQUOTA)
+		mp->m_sb.sb_uquotino = (*ip)->i_ino;
+	else
+		mp->m_sb.sb_gquotino = (*ip)->i_ino;
+	XFS_SB_UNLOCK(mp, s);
+	xfs_mod_sb(tp, sbfields);
+
+	if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
+				     NULL))) {
+		xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!");
+		return (error);
+	}
+	return (0);
+}
+
+
+STATIC int
+xfs_qm_reset_dqcounts(
+	xfs_mount_t	*mp,
+	xfs_buf_t	*bp,
+	xfs_dqid_t	id,
+	uint		type)
+{
+	xfs_disk_dquot_t	*ddq;
+	int			j;
+
+	xfs_buftrace("RESET DQUOTS", bp);
+	/*
+	 * Reset all counters and timers. They'll be
+	 * started afresh by xfs_qm_quotacheck.
+	 */
+#ifdef DEBUG
+	j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+	do_div(j, sizeof(xfs_dqblk_t));
+	ASSERT(XFS_QM_DQPERBLK(mp) == j);
+#endif
+	ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp);
+	for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) {
+		/*
+		 * Do a sanity check, and if needed, repair the dqblk. Don't
+		 * output any warnings because it's perfectly possible to
+		 * find unitialized dquot blks. See comment in xfs_qm_dqcheck.
+		 */
+		(void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR,
+				      "xfs_quotacheck");
+		INT_SET(ddq->d_bcount, ARCH_CONVERT, 0ULL);
+		INT_SET(ddq->d_icount, ARCH_CONVERT, 0ULL);
+		INT_SET(ddq->d_rtbcount, ARCH_CONVERT, 0ULL);
+		INT_SET(ddq->d_btimer, ARCH_CONVERT, (time_t)0);
+		INT_SET(ddq->d_itimer, ARCH_CONVERT, (time_t)0);
+		INT_SET(ddq->d_bwarns, ARCH_CONVERT, 0UL);
+		INT_SET(ddq->d_iwarns, ARCH_CONVERT, 0UL);
+		ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
+	}
+
+	return (0);
+}
+
+STATIC int
+xfs_qm_dqiter_bufs(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	firstid,
+	xfs_fsblock_t	bno,
+	xfs_filblks_t	blkcnt,
+	uint		flags)
+{
+	xfs_buf_t	*bp;
+	int		error;
+	int		notcommitted;
+	int		incr;
+
+	ASSERT(blkcnt > 0);
+	notcommitted = 0;
+	incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
+		XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
+	error = 0;
+
+	/*
+	 * Blkcnt arg can be a very big number, and might even be
+	 * larger than the log itself. So, we have to break it up into
+	 * manageable-sized transactions.
+	 * Note that we don't start a permanent transaction here; we might
+	 * not be able to get a log reservation for the whole thing up front,
+	 * and we don't really care to either, because we just discard
+	 * everything if we were to crash in the middle of this loop.
+	 */
+	while (blkcnt--) {
+		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+			      XFS_FSB_TO_DADDR(mp, bno),
+			      (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp);
+		if (error)
+			break;
+
+		(void) xfs_qm_reset_dqcounts(mp, bp, firstid,
+					     flags & XFS_QMOPT_UQUOTA ?
+					     XFS_DQ_USER : XFS_DQ_GROUP);
+		xfs_bdwrite(mp, bp);
+		/*
+		 * When quotachecking the root filesystem,
+		 * we may not have bdflush, and we may fill
+		 * up all available freebufs.
+		 * The workaround here is to push on the
+		 * log and do a bflush on the rootdev
+		 * periodically.
+		 */
+		if (mp->m_flags & XFS_MOUNT_ROOTQCHECK) {
+			if (++notcommitted == incr) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE | XFS_LOG_SYNC);
+				XFS_bflush(mp->m_ddev_targp);
+				notcommitted = 0;
+			}
+		}
+		/*
+		 * goto the next block.
+		 */
+		bno++;
+		firstid += XFS_QM_DQPERBLK(mp);
+	}
+	return (error);
+}
+
+/*
+ * Iterate over all allocated USR/GRP dquots in the system, calling a
+ * caller supplied function for every chunk of dquots that we find.
+ */
+STATIC int
+xfs_qm_dqiterate(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*qip,
+	uint		flags)
+{
+	xfs_bmbt_irec_t		*map;
+	int			i, nmaps;	/* number of map entries */
+	int			error;		/* return value */
+	xfs_fileoff_t		lblkno;
+	xfs_filblks_t		maxlblkcnt;
+	xfs_dqid_t		firstid;
+	xfs_fsblock_t		rablkno;
+	xfs_filblks_t		rablkcnt;
+
+	error = 0;
+	/*
+	 * This looks racey, but we can't keep an inode lock across a
+	 * trans_reserve. But, this gets called during quotacheck, and that
+	 * happens only at mount time which is single threaded.
+	 */
+	if (qip->i_d.di_nblocks == 0)
+		return (0);
+
+	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
+
+	lblkno = 0;
+	maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAX_FILE_OFFSET);
+	do {
+		nmaps = XFS_DQITER_MAP_SIZE;
+		/*
+		 * We aren't changing the inode itself. Just changing
+		 * some of its data. No new blocks are added here, and
+		 * the inode is never added to the transaction.
+		 */
+		xfs_ilock(qip, XFS_ILOCK_SHARED);
+		error = xfs_bmapi(NULL, qip, lblkno,
+				  maxlblkcnt - lblkno,
+				  XFS_BMAPI_METADATA,
+				  NULL,
+				  0, map, &nmaps, NULL);
+		xfs_iunlock(qip, XFS_ILOCK_SHARED);
+		if (error)
+			break;
+
+		ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
+		for (i = 0; i < nmaps; i++) {
+			ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+			ASSERT(map[i].br_blockcount);
+
+
+			lblkno += map[i].br_blockcount;
+
+			if (map[i].br_startblock == HOLESTARTBLOCK)
+				continue;
+
+			firstid = (xfs_dqid_t) map[i].br_startoff *
+				XFS_QM_DQPERBLK(mp);
+			/*
+			 * Do a read-ahead on the next extent.
+			 */
+			if ((i+1 < nmaps) &&
+			    (map[i+1].br_startblock != HOLESTARTBLOCK)) {
+				rablkcnt =  map[i+1].br_blockcount;
+				rablkno = map[i+1].br_startblock;
+				while (rablkcnt--) {
+					xfs_baread(mp->m_ddev_targp,
+					       XFS_FSB_TO_DADDR(mp, rablkno),
+					       (int)XFS_QI_DQCHUNKLEN(mp));
+					rablkno++;
+				}
+			}
+			/*
+			 * Iterate thru all the blks in the extent and
+			 * reset the counters of all the dquots inside them.
+			 */
+			if ((error = xfs_qm_dqiter_bufs(mp,
+						       firstid,
+						       map[i].br_startblock,
+						       map[i].br_blockcount,
+						       flags))) {
+				break;
+			}
+		}
+
+		if (error)
+			break;
+	} while (nmaps > 0);
+
+	kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map));
+
+	return (error);
+}
+
+/*
+ * Called by dqusage_adjust in doing a quotacheck.
+ * Given the inode, and a dquot (either USR or GRP, doesn't matter),
+ * this updates its incore copy as well as the buffer copy. This is
+ * so that once the quotacheck is done, we can just log all the buffers,
+ * as opposed to logging numerous updates to individual dquots.
+ */
+STATIC void
+xfs_qm_quotacheck_dqadjust(
+	xfs_dquot_t		*dqp,
+	xfs_qcnt_t		nblks,
+	xfs_qcnt_t		rtblks)
+{
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	xfs_dqtrace_entry(dqp, "QCHECK DQADJUST");
+	/*
+	 * Adjust the inode count and the block count to reflect this inode's
+	 * resource usage.
+	 */
+	INT_MOD(dqp->q_core.d_icount, ARCH_CONVERT, +1);
+	dqp->q_res_icount++;
+	if (nblks) {
+		INT_MOD(dqp->q_core.d_bcount, ARCH_CONVERT, nblks);
+		dqp->q_res_bcount += nblks;
+	}
+	if (rtblks) {
+		INT_MOD(dqp->q_core.d_rtbcount, ARCH_CONVERT, rtblks);
+		dqp->q_res_rtbcount += rtblks;
+	}
+
+	/*
+	 * Adjust the timers since we just changed usages
+	 */
+	if (! XFS_IS_SUSER_DQUOT(dqp))
+		xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core);
+
+	dqp->dq_flags |= XFS_DQ_DIRTY;
+}
+
+STATIC int
+xfs_qm_get_rtblks(
+	xfs_inode_t	*ip,
+	xfs_qcnt_t	*O_rtblks)
+{
+	xfs_filblks_t	rtblks;			/* total rt blks */
+	xfs_ifork_t	*ifp;			/* inode fork pointer */
+	xfs_extnum_t	nextents;		/* number of extent entries */
+	xfs_bmbt_rec_t	*base;			/* base of extent array */
+	xfs_bmbt_rec_t	*ep;			/* pointer to an extent entry */
+	int		error;
+
+	ASSERT(XFS_IS_REALTIME_INODE(ip));
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
+			return (error);
+	}
+	rtblks = 0;
+	nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+	base = &ifp->if_u1.if_extents[0];
+	for (ep = base; ep < &base[nextents]; ep++)
+		rtblks += xfs_bmbt_get_blockcount(ep);
+	*O_rtblks = (xfs_qcnt_t)rtblks;
+	return (0);
+}
+
+/*
+ * callback routine supplied to bulkstat(). Given an inumber, find its
+ * dquots and update them to account for resources taken by that inode.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqusage_adjust(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer - NULL */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		*buffer,	/* not used */
+	xfs_daddr_t	bno,		/* starting block of inode cluster */
+	void		*dip,		/* on-disk inode pointer (not used) */
+	int		*res)		/* result code value */
+{
+	xfs_inode_t	*ip;
+	xfs_dquot_t	*udqp, *gdqp;
+	xfs_qcnt_t	nblks, rtblks;
+	int		error;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * rootino must have its resources accounted for, not so with the quota
+	 * inodes.
+	 */
+	if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+		*res = BULKSTAT_RV_NOTHING;
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
+	 * interface expects the inode to be exclusively locked because that's
+	 * the case in all other instances. It's OK that we do this because
+	 * quotacheck is done only at mount time.
+	 */
+	if ((error = xfs_iget(mp, tp, ino, XFS_ILOCK_EXCL, &ip, bno))) {
+		*res = BULKSTAT_RV_NOTHING;
+		return (error);
+	}
+
+	if (ip->i_d.di_mode == 0) {
+		xfs_iput_new(ip, XFS_ILOCK_EXCL);
+		*res = BULKSTAT_RV_NOTHING;
+		return XFS_ERROR(ENOENT);
+	}
+
+	/*
+	 * Obtain the locked dquots. In case of an error (eg. allocation
+	 * fails for ENOSPC), we return the negative of the error number
+	 * to bulkstat, so that it can get propagated to quotacheck() and
+	 * making us disable quotas for the file system.
+	 */
+	if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) {
+		xfs_iput(ip, XFS_ILOCK_EXCL);
+		*res = BULKSTAT_RV_GIVEUP;
+		return (error);
+	}
+
+	rtblks = 0;
+	if (! XFS_IS_REALTIME_INODE(ip)) {
+		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks;
+	} else {
+		/*
+		 * Walk thru the extent list and count the realtime blocks.
+		 */
+		if ((error = xfs_qm_get_rtblks(ip, &rtblks))) {
+			xfs_iput(ip, XFS_ILOCK_EXCL);
+			if (udqp)
+				xfs_qm_dqput(udqp);
+			if (gdqp)
+				xfs_qm_dqput(gdqp);
+			*res = BULKSTAT_RV_GIVEUP;
+			return (error);
+		}
+		nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
+	}
+	ASSERT(ip->i_delayed_blks == 0);
+
+	/*
+	 * We can't release the inode while holding its dquot locks.
+	 * The inode can go into inactive and might try to acquire the dquotlocks.
+	 * So, just unlock here and do a vn_rele at the end.
+	 */
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * Add the (disk blocks and inode) resources occupied by this
+	 * inode to its dquots. We do this adjustment in the incore dquot,
+	 * and also copy the changes to its buffer.
+	 * We don't care about putting these changes in a transaction
+	 * envelope because if we crash in the middle of a 'quotacheck'
+	 * we have to start from the beginning anyway.
+	 * Once we're done, we'll log all the dquot bufs.
+	 *
+	 * The *QUOTA_ON checks below may look pretty racey, but quotachecks
+	 * and quotaoffs don't race. (Quotachecks happen at mount time only).
+	 */
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		ASSERT(udqp);
+		xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks);
+		xfs_qm_dqput(udqp);
+	}
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		ASSERT(gdqp);
+		xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks);
+		xfs_qm_dqput(gdqp);
+	}
+	/*
+	 * Now release the inode. This will send it to 'inactive', and
+	 * possibly even free blocks.
+	 */
+	VN_RELE(XFS_ITOV(ip));
+
+	/*
+	 * Goto next inode.
+	 */
+	*res = BULKSTAT_RV_DIDONE;
+	return (0);
+}
+
+/*
+ * Walk thru all the filesystem inodes and construct a consistent view
+ * of the disk quota world.
+ */
+STATIC int
+xfs_qm_quotacheck(
+	xfs_mount_t	*mp)
+{
+	int		done, count, error;
+	xfs_ino_t	lastino;
+	size_t		structsz;
+	xfs_inode_t	*uip, *gip;
+	uint		flags;
+
+	count = INT_MAX;
+	structsz = 1;
+	lastino = 0;
+	flags = 0;
+
+	ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp));
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * There should be no cached dquots. The (simplistic) quotacheck
+	 * algorithm doesn't like that.
+	 */
+	ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0);
+
+	cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname);
+
+	/*
+	 * First we go thru all the dquots on disk, USR and GRP, and reset
+	 * their counters to zero. We need a clean slate.
+	 * We don't log our changes till later.
+	 */
+	if ((uip = XFS_QI_UQIP(mp))) {
+		if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA)))
+			goto error_return;
+		flags |= XFS_UQUOTA_CHKD;
+	}
+
+	if ((gip = XFS_QI_GQIP(mp))) {
+		if ((error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA)))
+			goto error_return;
+		flags |= XFS_GQUOTA_CHKD;
+	}
+
+	do {
+		/*
+		 * Iterate thru all the inodes in the file system,
+		 * adjusting the corresponding dquot counters in core.
+		 */
+		if ((error = xfs_bulkstat(mp, NULL, &lastino, &count,
+				     xfs_qm_dqusage_adjust,
+				     structsz, NULL,
+				     BULKSTAT_FG_IGET|BULKSTAT_FG_VFSLOCKED,
+				     &done)))
+			break;
+
+	} while (! done);
+
+	/*
+	 * We can get this error if we couldn't do a dquot allocation inside
+	 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
+	 * dirty dquots that might be cached, we just want to get rid of them
+	 * and turn quotaoff. The dquots won't be attached to any of the inodes
+	 * at this point (because we intentionally didn't in dqget_noattach).
+	 */
+	if (error) {
+		xfs_qm_dqpurge_all(mp,
+				   XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA|
+				   XFS_QMOPT_QUOTAOFF);
+		goto error_return;
+	}
+	/*
+	 * We've made all the changes that we need to make incore.
+	 * Now flush_them down to disk buffers.
+	 */
+	xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
+
+	/*
+	 * We didn't log anything, because if we crashed, we'll have to
+	 * start the quotacheck from scratch anyway. However, we must make
+	 * sure that our dquot changes are secure before we put the
+	 * quotacheck'd stamp on the superblock. So, here we do a synchronous
+	 * flush.
+	 */
+	XFS_bflush(mp->m_ddev_targp);
+
+	/*
+	 * If one type of quotas is off, then it will lose its
+	 * quotachecked status, since we won't be doing accounting for
+	 * that type anymore.
+	 */
+	mp->m_qflags &= ~(XFS_GQUOTA_CHKD | XFS_UQUOTA_CHKD);
+	mp->m_qflags |= flags;
+
+#ifdef QUOTADEBUG
+	XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++");
+#endif
+
+ error_return:
+	cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
+	mp->m_flags &= ~(XFS_MOUNT_ROOTQCHECK);
+	return (error);
+}
+
+/*
+ * This is called after the superblock has been read in and we're ready to
+ * iget the quota inodes.
+ */
+STATIC int
+xfs_qm_init_quotainos(
+	xfs_mount_t	*mp)
+{
+	xfs_inode_t	*uip, *gip;
+	int		error;
+	__int64_t	sbflags;
+	uint		flags;
+
+	ASSERT(mp->m_quotainfo);
+	uip = gip = NULL;
+	sbflags = 0;
+	flags = 0;
+
+	/*
+	 * Get the uquota and gquota inodes
+	 */
+	if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
+		if (XFS_IS_UQUOTA_ON(mp) &&
+		    mp->m_sb.sb_uquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_uquotino > 0);
+			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+					     0, &uip, 0)))
+				return XFS_ERROR(error);
+		}
+		if (XFS_IS_GQUOTA_ON(mp) &&
+		    mp->m_sb.sb_gquotino != NULLFSINO) {
+			ASSERT(mp->m_sb.sb_gquotino > 0);
+			if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+					     0, &gip, 0))) {
+				if (uip)
+					VN_RELE(XFS_ITOV(uip));
+				return XFS_ERROR(error);
+			}
+		}
+	} else {
+		flags |= XFS_QMOPT_SBVERSION;
+		sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+			    XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+	}
+
+	/*
+	 * Create the two inodes, if they don't exist already. The changes
+	 * made above will get added to a transaction and logged in one of
+	 * the qino_alloc calls below.
+	 */
+	if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
+		if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+			return XFS_ERROR(EROFS);
+		if ((error = xfs_qm_qino_alloc(mp, &uip,
+					      sbflags | XFS_SB_UQUOTINO,
+					      flags | XFS_QMOPT_UQUOTA)))
+			return XFS_ERROR(error);
+
+		flags &= ~XFS_QMOPT_SBVERSION;
+	}
+	if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
+		if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) {
+			if (uip)
+				VN_RELE(XFS_ITOV(uip));
+			return XFS_ERROR(EROFS);
+		}
+		if ((error = xfs_qm_qino_alloc(mp, &gip,
+					      sbflags | XFS_SB_GQUOTINO,
+					      flags | XFS_QMOPT_GQUOTA))) {
+			if (uip)
+				VN_RELE(XFS_ITOV(uip));
+
+			return XFS_ERROR(error);
+		}
+	}
+
+	XFS_QI_UQIP(mp) = uip;
+	XFS_QI_GQIP(mp) = gip;
+
+	return (0);
+}
+
+
+/*
+ * Traverse the freelist of dquots and attempt to reclaim a maximum of
+ * 'howmany' dquots. This operation races with dqlookup(), and attempts to
+ * favor the lookup function ...
+ * XXXsup merge this with qm_reclaim_one().
+ */
+STATIC int
+xfs_qm_shake_freelist(
+	int howmany)
+{
+	int		nreclaimed;
+	xfs_dqhash_t	*hash;
+	xfs_dquot_t	*dqp, *nextdqp;
+	int		restarts;
+	int		nflushes;
+
+	if (howmany <= 0)
+		return (0);
+
+	nreclaimed = 0;
+	restarts = 0;
+	nflushes = 0;
+
+#ifdef QUOTADEBUG
+	printk("Shake free 0x%x\n", howmany);
+#endif
+	/* lock order is : hashchainlock, freelistlock, mplistlock */
+ tryagain:
+	xfs_qm_freelist_lock(xfs_Gqm);
+
+	for (dqp = xfs_Gqm->qm_dqfreelist.qh_next;
+	     ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) &&
+	      nreclaimed < howmany); ) {
+		xfs_dqlock(dqp);
+
+		/*
+		 * We are racing with dqlookup here. Naturally we don't
+		 * want to reclaim a dquot that lookup wants.
+		 */
+		if (dqp->dq_flags & XFS_DQ_WANT) {
+			xfs_dqunlock(dqp);
+			xfs_qm_freelist_unlock(xfs_Gqm);
+			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+				return (nreclaimed != howmany);
+			XFS_STATS_INC(xfsstats.xs_qm_dqwants);
+			goto tryagain;
+		}
+
+		/*
+		 * If the dquot is inactive, we are assured that it is
+		 * not on the mplist or the hashlist, and that makes our
+		 * life easier.
+		 */
+		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+			ASSERT(dqp->q_mount == NULL);
+			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+			ASSERT(dqp->HL_PREVP == NULL);
+			ASSERT(dqp->MPL_PREVP == NULL);
+			XFS_STATS_INC(xfsstats.xs_qm_dqinact_reclaims);
+			nextdqp = dqp->dq_flnext;
+			goto off_freelist;
+		}
+
+		ASSERT(dqp->MPL_PREVP);
+		/*
+		 * Try to grab the flush lock. If this dquot is in the process of
+		 * getting flushed to disk, we don't want to reclaim it.
+		 */
+		if (! xfs_qm_dqflock_nowait(dqp)) {
+			xfs_dqunlock(dqp);
+			dqp = dqp->dq_flnext;
+			continue;
+		}
+
+		/*
+		 * We have the flush lock so we know that this is not in the
+		 * process of being flushed. So, if this is dirty, flush it
+		 * DELWRI so that we don't get a freelist infested with
+		 * dirty dquots.
+		 */
+		if (XFS_DQ_IS_DIRTY(dqp)) {
+			xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
+			/*
+			 * We'll be doing a dqflush, and it is
+			 * possible to fill up the entire buffer cache
+			 * with dirty delayed write buffers when doing
+			 * this on a root filesystem, if bdflush isn't
+			 * running. So, do a flush periodically.
+			 */
+			if (dqp->q_mount->m_flags & XFS_MOUNT_ROOTQCHECK) {
+				if (!(++nflushes % XFS_QM_MAX_DQCLUSTER_LOGSZ)){
+					xfs_log_force(dqp->q_mount, (xfs_lsn_t)0,
+						 XFS_LOG_FORCE | XFS_LOG_SYNC);
+					XFS_bflush(dqp->q_mount->m_ddev_targp);
+				}
+			}
+			/*
+			 * We flush it delayed write, so don't bother
+			 * releasing the mplock.
+			 */
+			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
+			dqp = dqp->dq_flnext;
+			continue;
+		}
+		/*
+		 * We're trying to get the hashlock out of order. This races
+		 * with dqlookup; so, we giveup and goto the next dquot if
+		 * we couldn't get the hashlock. This way, we won't starve
+		 * a dqlookup process that holds the hashlock that is
+		 * waiting for the freelist lock.
+		 */
+		if (! xfs_qm_dqhashlock_nowait(dqp)) {
+			xfs_dqfunlock(dqp);
+			xfs_dqunlock(dqp);
+			dqp = dqp->dq_flnext;
+			continue;
+		}
+		/*
+		 * This races with dquot allocation code as well as dqflush_all
+		 * and reclaim code. So, if we failed to grab the mplist lock,
+		 * giveup everything and start over.
+		 */
+		hash = dqp->q_hash;
+		ASSERT(hash);
+		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
+			/* XXX put a sentinel so that we can come back here */
+			xfs_dqfunlock(dqp);
+			xfs_dqunlock(dqp);
+			XFS_DQ_HASH_UNLOCK(hash);
+			xfs_qm_freelist_unlock(xfs_Gqm);
+			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+				return (nreclaimed != howmany);
+			goto tryagain;
+		}
+		xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING");
+#ifdef QUOTADEBUG
+		printk("Shake 0x%p, ID 0x%x\n", dqp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT));
+#endif
+		ASSERT(dqp->q_nrefs == 0);
+		nextdqp = dqp->dq_flnext;
+		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
+		XQM_HASHLIST_REMOVE(hash, dqp);
+		xfs_dqfunlock(dqp);
+		xfs_qm_mplist_unlock(dqp->q_mount);
+		XFS_DQ_HASH_UNLOCK(hash);
+
+ off_freelist:
+		XQM_FREELIST_REMOVE(dqp);
+		xfs_dqunlock(dqp);
+		nreclaimed++;
+		XFS_STATS_INC(xfsstats.xs_qm_dqshake_reclaims);
+		xfs_qm_dqdestroy(dqp);
+		dqp = nextdqp;
+	}
+	xfs_qm_freelist_unlock(xfs_Gqm);
+	return (nreclaimed != howmany);
+}
+
+
+/*
+ * The shake manager routine called by shaked() when memory is
+ * running low.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_shake(void)
+{
+	int	ndqused, nfree, n;
+
+	if (!xfs_Gqm)
+		return;
+
+	nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */
+	/* incore dquots in all f/s's */
+	ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
+
+	ASSERT(ndqused >= 0);
+
+	if (nfree <= ndqused && nfree < ndquot)
+		return;
+
+	ndqused *= xfs_Gqm->qm_dqfree_ratio;	/* target # of free dquots */
+	n = nfree - ndqused - ndquot;		/* # over target */
+
+	(void) xfs_qm_shake_freelist(MAX(nfree, n));
+}
+
+
+/*
+ * Just pop the least recently used dquot off the freelist and
+ * recycle it. The returned dquot is locked.
+ */
+STATIC xfs_dquot_t *
+xfs_qm_dqreclaim_one(void)
+{
+	xfs_dquot_t	*dqpout;
+	xfs_dquot_t	*dqp;
+	int		restarts;
+	int		nflushes;
+
+	restarts = 0;
+	dqpout = NULL;
+	nflushes = 0;
+
+	/* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
+ startagain:
+	xfs_qm_freelist_lock(xfs_Gqm);
+
+	FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) {
+		xfs_dqlock(dqp);
+
+		/*
+		 * We are racing with dqlookup here. Naturally we don't
+		 * want to reclaim a dquot that lookup wants. We release the
+		 * freelist lock and start over, so that lookup will grab
+		 * both the dquot and the freelistlock.
+		 */
+		if (dqp->dq_flags & XFS_DQ_WANT) {
+			ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
+			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT");
+			xfs_dqunlock(dqp);
+			xfs_qm_freelist_unlock(xfs_Gqm);
+			if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+				return (NULL);
+			XFS_STATS_INC(xfsstats.xs_qm_dqwants);
+			goto startagain;
+		}
+
+		/*
+		 * If the dquot is inactive, we are assured that it is
+		 * not on the mplist or the hashlist, and that makes our
+		 * life easier.
+		 */
+		if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+			ASSERT(dqp->q_mount == NULL);
+			ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+			ASSERT(dqp->HL_PREVP == NULL);
+			ASSERT(dqp->MPL_PREVP == NULL);
+			XQM_FREELIST_REMOVE(dqp);
+			xfs_dqunlock(dqp);
+			dqpout = dqp;
+			XFS_STATS_INC(xfsstats.xs_qm_dqinact_reclaims);
+			break;
+		}
+
+		ASSERT(dqp->q_hash);
+		ASSERT(dqp->MPL_PREVP);
+
+		/*
+		 * Try to grab the flush lock. If this dquot is in the process of
+		 * getting flushed to disk, we don't want to reclaim it.
+		 */
+		if (! xfs_qm_dqflock_nowait(dqp)) {
+			xfs_dqunlock(dqp);
+			continue;
+		}
+
+		/*
+		 * We have the flush lock so we know that this is not in the
+		 * process of being flushed. So, if this is dirty, flush it
+		 * DELWRI so that we don't get a freelist infested with
+		 * dirty dquots.
+		 */
+		if (XFS_DQ_IS_DIRTY(dqp)) {
+			xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
+			/*
+			 * We'll be doing a dqflush, and it is
+			 * possible to fill up the entire buffer cache
+			 * with dirty delayed write buffers when doing
+			 * this on a root filesystem, if bdflush isn't
+			 * running. So, do a flush periodically.
+			 */
+			if (dqp->q_mount->m_flags & XFS_MOUNT_ROOTQCHECK) {
+				if (!(++nflushes % XFS_QM_MAX_DQCLUSTER_LOGSZ)) {
+					xfs_log_force(dqp->q_mount, (xfs_lsn_t)0,
+						XFS_LOG_FORCE | XFS_LOG_SYNC);
+					XFS_bflush(dqp->q_mount->m_ddev_targp);
+				}
+			}
+
+			/*
+			 * We flush it delayed write, so don't bother
+			 * releasing the freelist lock.
+			 */
+			(void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
+			xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
+			continue;
+		}
+
+		if (! xfs_qm_mplist_nowait(dqp->q_mount)) {
+			xfs_dqfunlock(dqp);
+			xfs_dqunlock(dqp);
+			continue;
+		}
+
+		if (! xfs_qm_dqhashlock_nowait(dqp))
+			goto mplistunlock;
+
+		ASSERT(dqp->q_nrefs == 0);
+		xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING");
+		XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp);
+		XQM_HASHLIST_REMOVE(dqp->q_hash, dqp);
+		XQM_FREELIST_REMOVE(dqp);
+		dqpout = dqp;
+		XFS_DQ_HASH_UNLOCK(dqp->q_hash);
+ mplistunlock:
+		xfs_qm_mplist_unlock(dqp->q_mount);
+		xfs_dqfunlock(dqp);
+		xfs_dqunlock(dqp);
+		if (dqpout)
+			break;
+	}
+
+	xfs_qm_freelist_unlock(xfs_Gqm);
+	return (dqpout);
+}
+
+
+/*------------------------------------------------------------------*/
+
+/*
+ * Return a new incore dquot. Depending on the number of
+ * dquots in the system, we either allocate a new one on the kernel heap,
+ * or reclaim a free one.
+ * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
+ * to reclaim an existing one from the freelist.
+ */
+boolean_t
+xfs_qm_dqalloc_incore(
+	xfs_dquot_t **O_dqpp)
+{
+	xfs_dquot_t	*dqp;
+
+	/*
+	 * Check against high water mark to see if we want to pop
+	 * a nincompoop dquot off the freelist.
+	 */
+	if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
+		/*
+		 * Try to recycle a dquot from the freelist.
+		 */
+		if ((dqp = xfs_qm_dqreclaim_one())) {
+			XFS_STATS_INC(xfsstats.xs_qm_dqreclaims);
+			/*
+			 * Just bzero the core here. The rest will get
+			 * reinitialized by caller. XXX we shouldn't even
+			 * do this bzero ...
+			 */
+			bzero(&dqp->q_core, sizeof(dqp->q_core));
+			*O_dqpp = dqp;
+			return (B_FALSE);
+		}
+		XFS_STATS_INC(xfsstats.xs_qm_dqreclaim_misses);
+	}
+
+	/*
+	 * Allocate a brand new dquot on the kernel heap and return it
+	 * to the caller to initialize.
+	 */
+	ASSERT(xfs_Gqm->qm_dqzone != NULL);
+	*O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+	atomic_inc(&xfs_Gqm->qm_totaldquots);
+
+	return (B_TRUE);
+}
+
+
+/*
+ * Start a transaction and write the incore superblock changes to
+ * disk. flags parameter indicates which fields have changed.
+ */
+int
+xfs_qm_write_sb_changes(
+	xfs_mount_t	*mp,
+	__int64_t	flags)
+{
+	xfs_trans_t	*tp;
+	int		error;
+
+#ifdef QUOTADEBUG
+	cmn_err(CE_NOTE,
+		"Writing superblock quota changes :%s",
+		mp->m_fsname);
+#endif
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+	if ((error = xfs_trans_reserve(tp, 0,
+				      mp->m_sb.sb_sectsize + 128, 0,
+				      0,
+				      XFS_DEFAULT_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		return (error);
+	}
+
+	xfs_mod_sb(tp, flags);
+	(void) xfs_trans_commit(tp, 0, NULL);
+
+	return (0);
+}
+
+
+/* --------------- utility functions for vnodeops ---------------- */
+
+
+/*
+ * Given an inode, a uid and gid (from cred_t) make sure that we have
+ * allocated relevant dquot(s) on disk, and that we won't exceed inode
+ * quotas by creating this file.
+ * This also attaches dquot(s) to the given inode after locking it,
+ * and returns the dquots corresponding to the uid and/or gid.
+ *
+ * in	: inode (unlocked)
+ * out	: udquot, gdquot with references taken and unlocked
+ */
+int
+xfs_qm_vop_dqalloc(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip,
+	uid_t		uid,
+	gid_t		gid,
+	uint		flags,
+	xfs_dquot_t	**O_udqpp,
+	xfs_dquot_t	**O_gdqpp)
+{
+	int		error;
+	xfs_dquot_t	*uq, *gq;
+	uint		lockflags;
+
+	lockflags = XFS_ILOCK_EXCL;
+	xfs_ilock(ip, lockflags);
+
+	if ((flags & XFS_QMOPT_INHERIT) &&
+	    XFS_INHERIT_GID(ip, XFS_MTOVFS(mp)))
+		gid = ip->i_d.di_gid;
+
+	/*
+	 * Attach the dquot(s) to this inode, doing a dquot allocation
+	 * if necessary. The dquot(s) will not be locked.
+	 */
+	if (XFS_NOT_DQATTACHED(mp, ip)) {
+		if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC |
+					    XFS_QMOPT_ILOCKED))) {
+			xfs_iunlock(ip, lockflags);
+			return (error);
+		}
+	}
+
+	uq = gq = NULL;
+	if ((flags & XFS_QMOPT_UQUOTA) &&
+	    XFS_IS_UQUOTA_ON(mp)) {
+		if (ip->i_d.di_uid != uid) {
+			/*
+			 * What we need is the dquot that has this uid, and
+			 * if we send the inode to dqget, the uid of the inode
+			 * takes priority over what's sent in the uid argument.
+			 * We must unlock inode here before calling dqget if
+			 * we're not sending the inode, because otherwise
+			 * we'll deadlock by doing trans_reserve while
+			 * holding ilock.
+			 */
+			xfs_iunlock(ip, lockflags);
+			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+						 XFS_DQ_USER,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &uq))) {
+				ASSERT(error != ENOENT);
+				return (error);
+			}
+			/*
+			 * Get the ilock in the right order.
+			 */
+			xfs_dqunlock(uq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			/*
+			 * Take an extra reference, because we'll return
+			 * this to caller
+			 */
+			ASSERT(ip->i_udquot);
+			uq = ip->i_udquot;
+			xfs_dqlock(uq);
+			XFS_DQHOLD(uq);
+			xfs_dqunlock(uq);
+		}
+	}
+	if ((flags & XFS_QMOPT_GQUOTA) &&
+	    XFS_IS_GQUOTA_ON(mp)) {
+		if (ip->i_d.di_gid != gid) {
+			xfs_iunlock(ip, lockflags);
+			if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+						 XFS_DQ_GROUP,
+						 XFS_QMOPT_DQALLOC |
+						 XFS_QMOPT_DOWARN,
+						 &gq))) {
+				if (uq)
+					xfs_qm_dqrele(uq);
+				ASSERT(error != ENOENT);
+				return (error);
+			}
+			xfs_dqunlock(gq);
+			lockflags = XFS_ILOCK_SHARED;
+			xfs_ilock(ip, lockflags);
+		} else {
+			ASSERT(ip->i_gdquot);
+			gq = ip->i_gdquot;
+			xfs_dqlock(gq);
+			XFS_DQHOLD(gq);
+			xfs_dqunlock(gq);
+		}
+	}
+	if (uq)
+		xfs_dqtrace_entry_ino(uq, "DQALLOC", ip);
+
+	xfs_iunlock(ip, lockflags);
+	if (O_udqpp)
+		*O_udqpp = uq;
+	else if (uq)
+		xfs_qm_dqrele(uq);
+	if (O_gdqpp)
+		*O_gdqpp = gq;
+	else if (gq)
+		xfs_qm_dqrele(gq);
+	return (0);
+}
+
+/*
+ * Actually transfer ownership, and do dquot modifications.
+ * These were already reserved.
+ */
+xfs_dquot_t *
+xfs_qm_vop_chown(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dquot_t	**IO_olddq,
+	xfs_dquot_t	*newdq)
+{
+	xfs_dquot_t	*prevdq;
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+
+	/* old dquot */
+	prevdq = *IO_olddq;
+	ASSERT(prevdq);
+	ASSERT(prevdq != newdq);
+
+	xfs_trans_mod_dquot(tp, prevdq,
+			    XFS_TRANS_DQ_BCOUNT,
+			    -(ip->i_d.di_nblocks));
+	xfs_trans_mod_dquot(tp, prevdq,
+			    XFS_TRANS_DQ_ICOUNT,
+			    -1);
+
+	/* the sparkling new dquot */
+	xfs_trans_mod_dquot(tp, newdq,
+			    XFS_TRANS_DQ_BCOUNT,
+			    ip->i_d.di_nblocks);
+	xfs_trans_mod_dquot(tp, newdq,
+			    XFS_TRANS_DQ_ICOUNT,
+			    1);
+
+	/*
+	 * Take an extra reference, because the inode
+	 * is going to keep this dquot pointer even
+	 * after the trans_commit.
+	 */
+	xfs_dqlock(newdq);
+	XFS_DQHOLD(newdq);
+	xfs_dqunlock(newdq);
+	*IO_olddq = newdq;
+
+	return (prevdq);
+}
+
+/*
+ * Quota reservations for setattr(AT_UID|AT_GID).
+ */
+int
+xfs_qm_vop_chown_reserve(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dquot_t	*udqp,
+	xfs_dquot_t	*gdqp,
+	uint		privileged)
+{
+	int		error;
+	xfs_mount_t	*mp;
+	uint		delblks;
+	xfs_dquot_t	*unresudq, *unresgdq, *delblksudq, *delblksgdq;
+
+	ASSERT(XFS_ISLOCKED_INODE(ip));
+	mp = ip->i_mount;
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	delblks = ip->i_delayed_blks;
+	delblksudq = delblksgdq = unresudq = unresgdq = NULL;
+
+	if (XFS_IS_UQUOTA_ON(mp) && udqp &&
+	    ip->i_d.di_uid != (uid_t)INT_GET(udqp->q_core.d_id, ARCH_CONVERT)) {
+		delblksudq = udqp;
+		/*
+		 * If there are delayed allocation blocks, then we have to
+		 * unreserve those from the old dquot, and add them to the
+		 * new dquot.
+		 */
+		if (delblks) {
+			ASSERT(ip->i_udquot);
+			unresudq = ip->i_udquot;
+		}
+	}
+	if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
+	    ip->i_d.di_gid != INT_GET(gdqp->q_core.d_id, ARCH_CONVERT)) {
+		delblksgdq = gdqp;
+		if (delblks) {
+			ASSERT(ip->i_gdquot);
+			unresgdq = ip->i_gdquot;
+		}
+	}
+
+	if ((error = xfs_trans_reserve_quota(tp, delblksudq,
+					    delblksgdq,
+					    ip->i_d.di_nblocks, 1,
+					    privileged)))
+		return (error);
+
+
+	/*
+	 * Do the delayed blks reservations/unreservations now. Since, these
+	 * are done without the help of a transaction, if a reservation fails
+	 * its previous reservations won't be automatically undone by trans
+	 * code. So, we have to do it manually here.
+	 */
+	if (delblks) {
+		/*
+		 * Do the reservations first. Unreservation can't fail.
+		 */
+		ASSERT(delblksudq || delblksgdq);
+		ASSERT(unresudq || unresgdq);
+		if ((error = xfs_trans_reserve_quota(NULL,
+						    delblksudq, delblksgdq,
+						    (xfs_qcnt_t)delblks, 0,
+						    privileged)))
+			return (error);
+		(void) xfs_trans_unreserve_quota(NULL,
+						 unresudq, unresgdq,
+						 (xfs_qcnt_t)delblks, 0,
+						 0);
+	}
+
+	return (0);
+}
+
+int
+xfs_qm_vop_rename_dqattach(
+	xfs_inode_t	**i_tab)
+{
+	xfs_inode_t	*ip;
+	int		i;
+	int		error;
+
+	ip = i_tab[0];
+
+	if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
+		error = xfs_qm_dqattach(ip, 0);
+		if (error)
+			return (error);
+	}
+	for (i = 1; (i < 4 && i_tab[i]); i++) {
+		/*
+		 * Watch out for duplicate entries in the table.
+		 */
+		if ((ip = i_tab[i]) != i_tab[i-1]) {
+			if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
+				error = xfs_qm_dqattach(ip, 0);
+				if (error)
+					return (error);
+			}
+		}
+	}
+	return (0);
+}
+
+void
+xfs_qm_vop_dqattach_and_dqmod_newinode(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	xfs_dquot_t	*udqp,
+	xfs_dquot_t	*gdqp)
+{
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
+
+	if (udqp) {
+		xfs_dqlock(udqp);
+		XFS_DQHOLD(udqp);
+		xfs_dqunlock(udqp);
+		ASSERT(ip->i_udquot == NULL);
+		ip->i_udquot = udqp;
+		ASSERT(ip->i_d.di_uid == INT_GET(udqp->q_core.d_id, ARCH_CONVERT));
+		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
+	}
+	if (gdqp) {
+		xfs_dqlock(gdqp);
+		XFS_DQHOLD(gdqp);
+		xfs_dqunlock(gdqp);
+		ASSERT(ip->i_gdquot == NULL);
+		ip->i_gdquot = gdqp;
+		ASSERT(ip->i_d.di_gid == INT_GET(gdqp->q_core.d_id, ARCH_CONVERT));
+		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
+	}
+}
+
+/* ------------- list stuff -----------------*/
+void
+xfs_qm_freelist_init(xfs_frlist_t *ql)
+{
+	ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql;
+	mutex_init(&ql->qh_lock, MUTEX_DEFAULT, "dqf");
+	ql->qh_version = 0;
+	ql->qh_nelems = 0;
+}
+
+void
+xfs_qm_freelist_destroy(xfs_frlist_t *ql)
+{
+	xfs_dquot_t	*dqp, *nextdqp;
+
+	mutex_lock(&ql->qh_lock, PINOD);
+	for (dqp = ql->qh_next;
+	     dqp != (xfs_dquot_t *)ql; ) {
+		xfs_dqlock(dqp);
+		nextdqp = dqp->dq_flnext;
+#ifdef QUOTADEBUG
+		printk("FREELIST destroy 0x%p\n", dqp);
+#endif
+		XQM_FREELIST_REMOVE(dqp);
+		xfs_dqunlock(dqp);
+		xfs_qm_dqdestroy(dqp);
+		dqp = nextdqp;
+	}
+	/*
+	 * Don't bother about unlocking.
+	 */
+	mutex_destroy(&ql->qh_lock);
+
+	ASSERT(ql->qh_nelems == 0);
+}
+
+void
+xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq)
+{
+	dq->dq_flnext = ql->qh_next;
+	dq->dq_flprev = (xfs_dquot_t *)ql;
+	ql->qh_next = dq;
+	dq->dq_flnext->dq_flprev = dq;
+	xfs_Gqm->qm_dqfreelist.qh_nelems++;
+	xfs_Gqm->qm_dqfreelist.qh_version++;
+}
+
+void
+xfs_qm_freelist_unlink(xfs_dquot_t *dq)
+{
+	xfs_dquot_t *next = dq->dq_flnext;
+	xfs_dquot_t *prev = dq->dq_flprev;
+
+	next->dq_flprev = prev;
+	prev->dq_flnext = next;
+	dq->dq_flnext = dq->dq_flprev = dq;
+	xfs_Gqm->qm_dqfreelist.qh_nelems--;
+	xfs_Gqm->qm_dqfreelist.qh_version++;
+}
+
+#ifdef QUOTADEBUG
+void
+xfs_qm_freelist_print(xfs_frlist_t *qlist, char *title)
+{
+	xfs_dquot_t *dq;
+	int i = 0;
+	printk("%s (#%d)\n", title, (int) qlist->qh_nelems);
+	FOREACH_DQUOT_IN_FREELIST(dq, qlist) {
+		printk("\t%d.\t\"%d (%s:0x%p)\"\t bcnt = %d, icnt = %d "
+		       "refs = %d\n",
+		       ++i, INT_GET(dq->q_core.d_id, ARCH_CONVERT),
+		       DQFLAGTO_TYPESTR(dq), dq,
+		       (int) INT_GET(dq->q_core.d_bcount, ARCH_CONVERT),
+		       (int) INT_GET(dq->q_core.d_icount, ARCH_CONVERT),
+		       (int) dq->q_nrefs);
+	}
+}
+#endif
+
+void
+xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq)
+{
+	xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq);
+}
+
+int
+xfs_qm_dqhashlock_nowait(
+	xfs_dquot_t *dqp)
+{
+	int locked;
+
+	locked = mutex_trylock(&((dqp)->q_hash->qh_lock));
+	return (locked);
+}
+
+int
+xfs_qm_freelist_lock_nowait(
+	xfs_qm_t *xqm)
+{
+	int locked;
+
+	locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock));
+	return (locked);
+}
+
+int
+xfs_qm_mplist_nowait(
+	xfs_mount_t	*mp)
+{
+	int locked;
+
+	ASSERT(mp->m_quotainfo);
+	locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp)));
+	return (locked);
+}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
new file mode 100644
index 000000000000..1c2c02530862
--- /dev/null
+++ b/fs/xfs/xfs_qm.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_QM_H__
+#define __XFS_QM_H__
+
+struct	xfs_dqhash;
+struct	xfs_inode;
+struct	xfs_dquot;
+
+extern kmem_zone_t	*qm_dqzone;
+extern kmem_zone_t	*qm_dqtrxzone;
+
+/*
+ * Used in xfs_qm_sync called by xfs_sync to count the max times that it can
+ * iterate over the mountpt's dquot list in one call.
+ */
+#define XFS_QM_SYNC_MAX_RESTARTS	7
+
+/*
+ * Ditto, for xfs_qm_dqreclaim_one.
+ */
+#define XFS_QM_RECLAIM_MAX_RESTARTS	4
+
+/*
+ * Ideal ratio of free to in use dquots. Quota manager makes an attempt
+ * to keep this balance.
+ */
+#define XFS_QM_DQFREE_RATIO		2
+
+/*
+ * Dquot hashtable constants/threshold values.
+ */
+#define XFS_QM_NCSIZE_THRESHOLD		5000
+#define XFS_QM_HASHSIZE_LOW		32
+#define XFS_QM_HASHSIZE_HIGH		64
+
+/*
+ * We output a cmn_err when quotachecking a quota file with more than
+ * this many fsbs.
+ */
+#define XFS_QM_BIG_QCHECK_NBLKS		500
+
+/*
+ * This defines the unit of allocation of dquots.
+ * Currently, it is just one file system block, and a 4K blk contains 30
+ * (136 * 30 = 4080) dquots. It's probably not worth trying to make
+ * this more dynamic.
+ * XXXsup However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB	(xfs_filblks_t)1
+/*
+ * When doing a quotacheck, we log dquot clusters of this many FSBs at most
+ * in a single transaction. We don't want to ask for too huge a log reservation.
+ */
+#define XFS_QM_MAX_DQCLUSTER_LOGSZ	3
+
+typedef xfs_dqhash_t	xfs_dqlist_t;
+/*
+ * The freelist head. The first two fields match the first two in the
+ * xfs_dquot_t structure (in xfs_dqmarker_t)
+ */
+typedef struct xfs_frlist {
+       struct xfs_dquot *qh_next;
+       struct xfs_dquot *qh_prev;
+       mutex_t		 qh_lock;
+       uint		 qh_version;
+       uint		 qh_nelems;
+} xfs_frlist_t;
+
+/*
+ * Quota Manager (global) structure. Lives only in core.
+ */
+typedef struct xfs_qm {
+	xfs_dqlist_t	*qm_usr_dqhtable;/* udquot hash table */
+	xfs_dqlist_t	*qm_grp_dqhtable;/* gdquot hash table */
+	uint		 qm_dqhashmask;	 /* # buckets in dq hashtab - 1 */
+	xfs_frlist_t	 qm_dqfreelist;	 /* freelist of dquots */
+	atomic_t	 qm_totaldquots; /* total incore dquots */
+	uint		 qm_nrefs;	 /* file systems with quota on */
+	int		 qm_dqfree_ratio;/* ratio of free to inuse dquots */
+	kmem_zone_t	*qm_dqzone;	 /* dquot mem-alloc zone */
+	kmem_zone_t	*qm_dqtrxzone;	 /* t_dqinfo of transactions */
+} xfs_qm_t;
+
+/*
+ * Various quota information for individual filesystems.
+ * The mount structure keeps a pointer to this.
+ */
+typedef struct xfs_quotainfo {
+	xfs_inode_t	*qi_uquotaip;	 /* user quota inode */
+	xfs_inode_t	*qi_gquotaip;	 /* group quota inode */
+	lock_t		 qi_pinlock;	 /* dquot pinning mutex */
+	xfs_dqlist_t	 qi_dqlist;	 /* all dquots in filesys */
+	int		 qi_dqreclaims;	 /* a change here indicates
+					    a removal in the dqlist */
+	time_t		 qi_btimelimit;	 /* limit for blks timer */
+	time_t		 qi_itimelimit;	 /* limit for inodes timer */
+	time_t		 qi_rtbtimelimit;/* limit for rt blks timer */
+	xfs_qwarncnt_t	 qi_bwarnlimit;	 /* limit for num warnings */
+	xfs_qwarncnt_t	 qi_iwarnlimit;	 /* limit for num warnings */
+	mutex_t		 qi_quotaofflock;/* to serialize quotaoff */
+	/* Some useful precalculated constants */
+	xfs_filblks_t	 qi_dqchunklen;	 /* # BBs in a chunk of dqs */
+	uint		 qi_dqperchunk;	 /* # ondisk dqs in above chunk */
+} xfs_quotainfo_t;
+
+
+/*
+ * The structure kept inside the xfs_trans_t keep track of dquot changes
+ * within a transaction and apply them later.
+ */
+typedef struct xfs_dqtrx {
+	struct xfs_dquot *qt_dquot;	  /* the dquot this refers to */
+	ulong		qt_blk_res;	  /* blks reserved on a dquot */
+	ulong		qt_blk_res_used;  /* blks used from the reservation */
+	ulong		qt_ino_res;	  /* inode reserved on a dquot */
+	ulong		qt_ino_res_used;  /* inodes used from the reservation */
+	long		qt_bcount_delta;  /* dquot blk count changes */
+	long		qt_delbcnt_delta; /* delayed dquot blk count changes */
+	long		qt_icount_delta;  /* dquot inode count changes */
+	ulong		qt_rtblk_res;	  /* # blks reserved on a dquot */
+	ulong		qt_rtblk_res_used;/* # blks used from reservation */
+	long		qt_rtbcount_delta;/* dquot realtime blk changes */
+	long		qt_delrtb_delta;  /* delayed RT blk count changes */
+} xfs_dqtrx_t;
+
+/*
+ * We keep the usr and grp dquots separately so that locking will be easier
+ * to do at commit time. All transactions that we know of at this point
+ * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
+ */
+#define XFS_QM_TRANS_MAXDQS		2
+typedef struct xfs_dquot_acct {
+	xfs_dqtrx_t	dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
+	xfs_dqtrx_t	dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
+} xfs_dquot_acct_t;
+
+/*
+ * Users are allowed to have a usage exceeding their softlimit for
+ * a period this long.
+ */
+#define XFS_QM_BTIMELIMIT	DQ_BTIMELIMIT
+#define XFS_QM_RTBTIMELIMIT	DQ_BTIMELIMIT
+#define XFS_QM_ITIMELIMIT	DQ_FTIMELIMIT
+
+#define XFS_QM_BWARNLIMIT	5
+#define XFS_QM_IWARNLIMIT	5
+
+#define XFS_QM_LOCK(xqm)	(mutex_lock(&xqm##_lock, PINOD))
+#define XFS_QM_UNLOCK(xqm)	(mutex_unlock(&xqm##_lock))
+#define XFS_QM_HOLD(xqm)	((xqm)->qm_nrefs++)
+#define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
+
+extern int		xfs_qm_init_quotainfo(xfs_mount_t *);
+extern void		xfs_qm_destroy_quotainfo(xfs_mount_t *);
+extern void		xfs_qm_dqunlink(xfs_dquot_t *);
+extern boolean_t	xfs_qm_dqalloc_incore(xfs_dquot_t **);
+extern int		xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+
+/* list stuff */
+extern void		xfs_qm_freelist_init(xfs_frlist_t *);
+extern void		xfs_qm_freelist_destroy(xfs_frlist_t *);
+extern void		xfs_qm_freelist_insert(xfs_frlist_t *, xfs_dquot_t *);
+extern void		xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
+extern void		xfs_qm_freelist_unlink(xfs_dquot_t *);
+extern int		xfs_qm_freelist_lock_nowait(xfs_qm_t *);
+extern int		xfs_qm_mplist_nowait(xfs_mount_t *);
+extern int		xfs_qm_dqhashlock_nowait(xfs_dquot_t *);
+
+/* system call interface */
+extern int linvfs_getxstate(struct super_block *, struct fs_quota_stat *);
+extern int linvfs_setxstate(struct super_block *, unsigned int, int);
+extern int linvfs_getxquota(struct super_block *, int, qid_t, struct fs_disk_quota *);
+extern int linvfs_setxquota(struct super_block *, int, qid_t, struct fs_disk_quota *);
+
+#ifdef DEBUG
+extern int		xfs_qm_internalqcheck(xfs_mount_t *);
+#else
+#define xfs_qm_internalqcheck(mp)	(0)
+#endif
+
+#ifdef QUOTADEBUG
+extern void		xfs_qm_freelist_print(xfs_frlist_t *, char *);
+#else
+#define xfs_qm_freelist_print(a, b)	do { } while (0)
+#endif
+
+#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
new file mode 100644
index 000000000000..73f22a296ec8
--- /dev/null
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -0,0 +1,1465 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+#ifdef DEBUG
+# define qdprintk(s, args...)		printk(s, ## args)
+#else
+# define qdprintk(s, args...)		do { } while (0)
+#endif
+
+STATIC int	xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
+STATIC int	xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+STATIC int	xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
+STATIC int	xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
+					fs_disk_quota_t *);
+STATIC int	xfs_qm_scall_quotaon(xfs_mount_t *, uint);
+STATIC int	xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t);
+STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
+STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
+					uint);
+STATIC uint	xfs_qm_import_flags(uint);
+STATIC uint	xfs_qm_export_flags(uint);
+STATIC uint	xfs_qm_import_qtype_flags(uint);
+STATIC uint	xfs_qm_export_qtype_flags(uint);
+STATIC void	xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
+					fs_disk_quota_t *);
+
+
+int
+linvfs_getxstate(
+	struct super_block	*sb,
+	struct fs_quota_stat	*fqs)
+{
+	xfs_mount_t		*mp;
+	vfs_t			*vfsp;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	return -xfs_qm_scall_getqstat(mp, fqs);
+}
+
+int
+linvfs_setxstate(
+	struct super_block	*sb,
+	unsigned int		flags,
+	int			op)
+{
+	xfs_mount_t		*mp;
+	vfs_t			*vfsp;
+	uint			qflags;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	if (vfsp->vfs_flag & VFS_RDONLY)
+		return -EROFS;
+
+	switch (op) {
+	case Q_XQUOTARM:
+		if (XFS_IS_QUOTA_ON(mp)) {
+			qdprintk("cannot remove, quota on: flags=%x\n", flags);
+			return -EINVAL;
+		}
+		qflags = xfs_qm_import_qtype_flags(flags);
+		return -xfs_qm_scall_trunc_qfiles(mp, qflags);
+	case Q_XQUOTAON:
+		qflags = xfs_qm_import_flags(flags);
+		return -xfs_qm_scall_quotaon(mp, qflags);
+	case Q_XQUOTAOFF:
+		qflags = xfs_qm_import_flags(flags);
+		if (mp->m_dev == rootdev)
+			return -xfs_qm_scall_quotaoff(mp, qflags, B_FALSE);
+		if (!XFS_IS_QUOTA_ON(mp))
+			return -ESRCH;
+		return -xfs_qm_scall_quotaoff(mp, qflags, B_FALSE);
+	}
+	qdprintk("cannot set state, invalid op: op=%x flags=%x\n", op, flags);
+	return -EINVAL;
+}
+
+int
+linvfs_getxquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	xfs_mount_t		*mp;
+	vfs_t			*vfsp;
+	int			qtype;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+	qtype = (type == GRPQUOTA)? XFS_DQ_GROUP : XFS_DQ_USER;
+	return -xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, qtype, fdq);
+}
+
+int
+linvfs_setxquota(
+	struct super_block	*sb,
+	int			type,
+	qid_t			id,
+	struct fs_disk_quota	*fdq)
+{
+	xfs_mount_t		*mp;
+	vfs_t			*vfsp;
+	int			qtype;
+
+	vfsp = LINVFS_GET_VFS(sb);
+	mp = XFS_BHVTOM(vfsp->vfs_fbhv);
+	if (!XFS_IS_QUOTA_ON(mp))
+		return -ESRCH;
+	if (vfsp->vfs_flag & VFS_RDONLY)
+		return -EROFS;
+	qtype = (type == GRPQUOTA)? XFS_DQ_GROUP : XFS_DQ_USER;
+	return xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, qtype, fdq);
+}
+
+
+/*
+ * Turn off quota accounting and/or enforcement for all udquots and/or
+ * gdquots. Called only at unmount time.
+ *
+ * This assumes that there are no dquots of this file system cached
+ * incore, and modifies the ondisk dquot directly. Therefore, for example,
+ * it is an error to call this twice, without purging the cache.
+ */
+STATIC int
+xfs_qm_scall_quotaoff(
+	xfs_mount_t		*mp,
+	uint			flags,
+	boolean_t		force)
+{
+	uint			dqtype;
+	unsigned long	s;
+	int			error;
+	uint			inactivate_flags;
+	xfs_qoff_logitem_t	*qoffstart;
+	uint			sbflags, newflags;
+	int			nculprits;
+
+	if (!force && !capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+	/*
+	 * Only root file system can have quotas enabled on disk but not
+	 * in core. Note that quota utilities (like quotaoff) _expect_
+	 * errno == EEXIST here.
+	 */
+	if (mp->m_dev != rootdev && (mp->m_qflags & flags) == 0)
+		return XFS_ERROR(EEXIST);
+	error = 0;
+
+	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+
+	/*
+	 * We don't want to deal with two quotaoffs messing up each other,
+	 * so we're going to serialize it. quotaoff isn't exactly a performance
+	 * critical thing.
+	 * If quotaoff, then we must be dealing with the root filesystem.
+	 */
+	ASSERT(mp->m_quotainfo || mp->m_dev == rootdev);
+	if (mp->m_quotainfo)
+		mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
+
+	/*
+	 * Root file system may or may not have quotas on in core.
+	 * We have to perform the quotaoff accordingly.
+	 */
+	if (mp->m_dev == rootdev) {
+		s = XFS_SB_LOCK(mp);
+		sbflags = mp->m_sb.sb_qflags;
+		if ((mp->m_qflags & flags) == 0) {
+			mp->m_sb.sb_qflags &= ~(flags);
+			newflags = mp->m_sb.sb_qflags;
+			XFS_SB_UNLOCK(mp, s);
+			if (mp->m_quotainfo)
+				mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+			if (sbflags != newflags)
+			      error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
+			return (error);
+		}
+		XFS_SB_UNLOCK(mp, s);
+
+		if ((sbflags & flags) != (mp->m_qflags & flags)) {
+			/*
+			 * This can happen only with grp+usr quota
+			 * combination. Note: 1) accounting cannot be turned
+			 * off without enforcement also getting turned off.
+			 * 2) Every flag that exists in mpqflags MUST exist
+			 * in sbqflags (but not vice versa).
+			 * which means at this point sbqflags = UQ+GQ+..,
+			 * and mpqflags = UQ or GQ.
+			 */
+			ASSERT(sbflags & XFS_GQUOTA_ACCT);
+			ASSERT((sbflags & XFS_ALL_QUOTA_ACCT) !=
+			       (mp->m_qflags & XFS_ALL_QUOTA_ACCT));
+
+			qdprintk("quotaoff, sbflags=%x flags=%x m_qflags=%x\n",
+				sbflags, flags, mp->m_qflags);
+			/* XXX TBD Finish this for group quota support */
+			/* We need to update the SB and mp separately */
+			return XFS_ERROR(EINVAL);
+		}
+	}
+	ASSERT(mp->m_quotainfo);
+
+	/*
+	 * if we're just turning off quota enforcement, change mp and go.
+	 */
+	if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
+		mp->m_qflags &= ~(flags);
+
+		s = XFS_SB_LOCK(mp);
+		mp->m_sb.sb_qflags = mp->m_qflags;
+		XFS_SB_UNLOCK(mp, s);
+		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+
+		/* XXX what to do if error ? Revert back to old vals incore ? */
+		error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
+		return (error);
+	}
+
+	dqtype = 0;
+	inactivate_flags = 0;
+	/*
+	 * If accounting is off, we must turn enforcement off, clear the
+	 * quota 'CHKD' certificate to make it known that we have to
+	 * do a quotacheck the next time this quota is turned on.
+	 */
+	if (flags & XFS_UQUOTA_ACCT) {
+		dqtype |= XFS_QMOPT_UQUOTA;
+		flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
+		inactivate_flags |= XFS_UQUOTA_ACTIVE;
+	}
+	if (flags & XFS_GQUOTA_ACCT) {
+		dqtype |= XFS_QMOPT_GQUOTA;
+		flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD);
+		inactivate_flags |= XFS_GQUOTA_ACTIVE;
+	}
+
+	/*
+	 * Nothing to do? Don't complain.
+	 * This happens when we're just turning off quota enforcement.
+	 */
+	if ((mp->m_qflags & flags) == 0) {
+		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+		return (0);
+	}
+
+	/*
+	 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
+	 * and synchronously.
+	 */
+	xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+
+	/*
+	 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
+	 * to take care of the race between dqget and quotaoff. We don't take
+	 * any special locks to reset these bits. All processes need to check
+	 * these bits *after* taking inode lock(s) to see if the particular
+	 * quota type is in the process of being turned off. If *ACTIVE, it is
+	 * guaranteed that all dquot structures and all quotainode ptrs will all
+	 * stay valid as long as that inode is kept locked.
+	 *
+	 * There is no turning back after this.
+	 */
+	mp->m_qflags &= ~inactivate_flags;
+
+	/*
+	 * Give back all the dquot reference(s) held by inodes.
+	 * Here we go thru every single incore inode in this file system, and
+	 * do a dqrele on the i_udquot/i_gdquot that it may have.
+	 * Essentially, as long as somebody has an inode locked, this guarantees
+	 * that quotas will not be turned off. This is handy because in a
+	 * transaction once we lock the inode(s) and check for quotaon, we can
+	 * depend on the quota inodes (and other things) being valid as long as
+	 * we keep the lock(s).
+	 */
+	xfs_qm_dqrele_all_inodes(mp, flags);
+
+	/*
+	 * Next we make the changes in the quota flag in the mount struct.
+	 * This isn't protected by a particular lock directly, because we
+	 * don't want to take a mrlock everytime we depend on quotas being on.
+	 */
+	mp->m_qflags &= ~(flags);
+
+	/*
+	 * Go through all the dquots of this file system and purge them,
+	 * according to what was turned off. We may not be able to get rid
+	 * of all dquots, because dquots can have temporary references that
+	 * are not attached to inodes. eg. xfs_setattr, xfs_create.
+	 * So, if we couldn't purge all the dquots from the filesystem,
+	 * we can't get rid of the incore data structures.
+	 */
+	while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) {
+		delay(10 * nculprits);
+	}
+
+	/*
+	 * Transactions that had started before ACTIVE state bit was cleared
+	 * could have logged many dquots, so they'd have higher LSNs than
+	 * the first QUOTAOFF log record does. If we happen to crash when
+	 * the tail of the log has gone past the QUOTAOFF record, but
+	 * before the last dquot modification, those dquots __will__
+	 * recover, and that's not good.
+	 *
+	 * So, we have QUOTAOFF start and end logitems; the start
+	 * logitem won't get overwritten until the end logitem appears...
+	 */
+	xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+
+	/*
+	 * If quotas is completely disabled, close shop.
+	 */
+	if ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_ALL) {
+		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+		xfs_qm_destroy_quotainfo(mp);
+		return (0);
+	}
+
+	/*
+	 * Release our quotainode references, and vn_purge them,
+	 * if we don't need them anymore.
+	 */
+	if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
+		XFS_PURGE_INODE(XFS_QI_UQIP(mp));
+		XFS_QI_UQIP(mp) = NULL;
+	}
+	if ((dqtype & XFS_QMOPT_GQUOTA) && XFS_QI_GQIP(mp)) {
+		XFS_PURGE_INODE(XFS_QI_GQIP(mp));
+		XFS_QI_GQIP(mp) = NULL;
+	}
+	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+
+	return (error);
+}
+
+STATIC int
+xfs_qm_scall_trunc_qfiles(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	int		error;
+	xfs_inode_t	*qip;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+	error = 0;
+	if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) {
+		qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
+		return XFS_ERROR(EINVAL);
+	}
+
+	if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
+		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, &qip, 0);
+		if (! error) {
+			(void) xfs_truncate_file(mp, qip);
+			VN_RELE(XFS_ITOV(qip));
+		}
+	}
+
+	if ((flags & XFS_DQ_GROUP) && mp->m_sb.sb_gquotino != NULLFSINO) {
+		error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, &qip, 0);
+		if (! error) {
+			(void) xfs_truncate_file(mp, qip);
+			VN_RELE(XFS_ITOV(qip));
+		}
+	}
+
+	return (error);
+}
+
+
+/*
+ * This does two separate functions:
+ * Switch on quotas for the root file system. This will take effect only
+ * on reboot.
+ * Switch on (a given) quota enforcement for both root and non-root filesystems.
+ * This takes effect immediately.
+ */
+STATIC int
+xfs_qm_scall_quotaon(
+	xfs_mount_t	*mp,
+	uint		flags)
+{
+	int		error;
+	unsigned long s;
+	uint		qf;
+	uint		accflags;
+	__int64_t	sbflags;
+	boolean_t	rootfs;
+	boolean_t	delay;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+
+	rootfs = (boolean_t) (mp->m_dev == rootdev);
+
+	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+	/*
+	 * If caller wants to turn on accounting on /, but accounting
+	 * is already turned on, ignore ACCTing flags.
+	 * Switching on quota accounting for non-root filesystems
+	 * must be done at mount time.
+	 */
+	accflags = flags & XFS_ALL_QUOTA_ACCT;
+	if (!rootfs ||
+	    (accflags && rootfs && ((mp->m_qflags & accflags) == accflags))) {
+		flags &= ~(XFS_ALL_QUOTA_ACCT);
+	}
+
+	sbflags = 0;
+	delay = (boolean_t) ((flags & XFS_ALL_QUOTA_ACCT) != 0);
+
+	if (flags == 0) {
+		qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags);
+		return XFS_ERROR(EINVAL);
+	}
+
+	/* Only rootfs can turn on quotas with a delayed effect */
+	ASSERT(!delay || rootfs);
+
+	/*
+	 * Can't enforce without accounting. We check the superblock
+	 * qflags here instead of m_qflags because rootfs can have
+	 * quota acct on ondisk without m_qflags' knowing.
+	 */
+	if (((flags & XFS_UQUOTA_ACCT) == 0 &&
+	    (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+	    (flags & XFS_UQUOTA_ENFD))
+	    ||
+	    ((flags & XFS_GQUOTA_ACCT) == 0 &&
+	    (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+	    (flags & XFS_GQUOTA_ENFD))) {
+		qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n",
+			flags, mp->m_sb.sb_qflags);
+		return XFS_ERROR(EINVAL);
+	}
+	/*
+	 * If everything's upto-date incore, then don't waste time.
+	 */
+	if ((mp->m_qflags & flags) == flags)
+		return XFS_ERROR(EEXIST);
+
+	/*
+	 * Change superblock version (if needed) for the root filesystem
+	 */
+	if (rootfs && !XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
+		qdprintk("Old superblock version %x\n", mp->m_sb.sb_versionnum);
+		s = XFS_SB_LOCK(mp);
+		XFS_SB_VERSION_ADDQUOTA(&mp->m_sb);
+		mp->m_sb.sb_uquotino = NULLFSINO;
+		mp->m_sb.sb_gquotino = NULLFSINO;
+		mp->m_sb.sb_qflags = 0;
+		XFS_SB_UNLOCK(mp, s);
+		qdprintk("Converted to version %x\n", mp->m_sb.sb_versionnum);
+		sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+			    XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+	}
+
+	/*
+	 * Change sb_qflags on disk but not incore mp->qflags
+	 * if this is the root filesystem.
+	 */
+	s = XFS_SB_LOCK(mp);
+	qf = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = qf | flags;
+	XFS_SB_UNLOCK(mp, s);
+
+	/*
+	 * There's nothing to change if it's the same.
+	 */
+	if ((qf & flags) == flags && sbflags == 0)
+		return XFS_ERROR(EEXIST);
+	sbflags |= XFS_SB_QFLAGS;
+
+	if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
+		return (error);
+	/*
+	 * If we had just turned on quotas (ondisk) for rootfs, or if we aren't
+	 * trying to switch on quota enforcement, we are done.
+	 */
+	if (delay ||
+	    ((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
+	     (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
+	    (flags & XFS_ALL_QUOTA_ENFD) == 0)
+		return (0);
+
+	if (! XFS_IS_QUOTA_RUNNING(mp))
+		return XFS_ERROR(ESRCH);
+
+	/*
+	 * Switch on quota enforcement in core. This applies to both root
+	 * and non-root file systems.
+	 */
+	mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
+	mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
+	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+
+	return (0);
+}
+
+
+
+/*
+ * Return quota status information, such as uquota-off, enforcements, etc.
+ */
+STATIC int
+xfs_qm_scall_getqstat(
+	xfs_mount_t	*mp,
+	fs_quota_stat_t *out)
+{
+	xfs_inode_t	*uip, *gip;
+	boolean_t	tempuqip, tempgqip;
+	__uint16_t	sbflags;
+
+	uip = gip = NULL;
+	tempuqip = tempgqip = B_FALSE;
+	bzero(out, sizeof(fs_quota_stat_t));
+
+	out->qs_version = FS_QSTAT_VERSION;
+	if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) {
+		out->qs_uquota.qfs_ino = NULLFSINO;
+		out->qs_gquota.qfs_ino = NULLFSINO;
+		return (0);
+	}
+	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
+							(XFS_ALL_QUOTA_ACCT|
+							 XFS_ALL_QUOTA_ENFD));
+	/*
+	 * If the qflags are different on disk, as can be the case when
+	 * root filesystem's quotas are being turned on, return them in the
+	 * HI 8 bits.
+	 */
+	if (mp->m_dev == rootdev) {
+		sbflags = (__uint16_t) xfs_qm_export_flags(mp->m_sb.sb_qflags &
+							   (XFS_ALL_QUOTA_ACCT|
+							    XFS_ALL_QUOTA_ENFD));
+		ASSERT((out->qs_flags & 0xff00) == 0);
+		if (sbflags != out->qs_flags)
+			out->qs_flags |= ((sbflags & 0x00ff) << 8);
+	}
+
+	out->qs_pad = 0;
+	out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
+	out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+	if (mp->m_quotainfo) {
+		uip = mp->m_quotainfo->qi_uquotaip;
+		gip = mp->m_quotainfo->qi_gquotaip;
+	}
+	if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
+		if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, &uip, 0) == 0)
+			tempuqip = B_TRUE;
+	}
+	if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
+		if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, &gip, 0) == 0)
+			tempgqip = B_TRUE;
+	}
+	if (uip) {
+		out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
+		out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
+		if (tempuqip)
+			VN_RELE(XFS_ITOV(uip));
+	}
+	if (gip) {
+		out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
+		out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
+		if (tempgqip)
+			VN_RELE(XFS_ITOV(gip));
+	}
+	if (mp->m_quotainfo) {
+		out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
+		out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp);
+		out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp);
+		out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp);
+		out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp);
+		out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp);
+	}
+	return (0);
+}
+
+/*
+ * Adjust quota limits, and start/stop timers accordingly.
+ */
+STATIC int
+xfs_qm_scall_setqlim(
+	xfs_mount_t		*mp,
+	xfs_dqid_t		id,
+	uint			type,
+	fs_disk_quota_t		*newlim)
+{
+	xfs_disk_dquot_t	*ddq;
+	xfs_dquot_t		*dqp;
+	xfs_trans_t		*tp;
+	int			error;
+	xfs_qcnt_t		hard, soft;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+
+	if ((newlim->d_fieldmask & (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK)) == 0)
+		return (0);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
+				      0, 0, XFS_DEFAULT_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		return (error);
+	}
+
+	/*
+	 * We don't want to race with a quotaoff so take the quotaoff lock.
+	 * (We don't hold an inode lock, so there's nothing else to stop
+	 * a quotaoff from happening). (XXXThis doesn't currently happen
+	 * because we take the vfslock before calling xfs_qm_sysent).
+	 */
+	mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD);
+
+	/*
+	 * Get the dquot (locked), and join it to the transaction.
+	 * Allocate the dquot if this doesn't exist.
+	 */
+	if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
+		xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+		mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+		ASSERT(error != ENOENT);
+		return (error);
+	}
+	xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET");
+	xfs_trans_dqjoin(tp, dqp);
+	ddq = &dqp->q_core;
+
+	/*
+	 * Make sure that hardlimits are >= soft limits before changing.
+	 */
+	hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
+		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
+			INT_GET(ddq->d_blk_hardlimit, ARCH_CONVERT);
+	soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
+		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
+			INT_GET(ddq->d_blk_softlimit, ARCH_CONVERT);
+	if (hard == 0 || hard >= soft) {
+		INT_SET(ddq->d_blk_hardlimit, ARCH_CONVERT, hard);
+		INT_SET(ddq->d_blk_softlimit, ARCH_CONVERT, soft);
+	}
+	else {
+		qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft);
+	}
+	hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
+		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
+			INT_GET(ddq->d_rtb_hardlimit, ARCH_CONVERT);
+	soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
+		(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
+			INT_GET(ddq->d_rtb_softlimit, ARCH_CONVERT);
+	if (hard == 0 || hard >= soft) {
+		INT_SET(ddq->d_rtb_hardlimit, ARCH_CONVERT, hard);
+		INT_SET(ddq->d_rtb_softlimit, ARCH_CONVERT, soft);
+	}
+	else
+		qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
+
+	hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
+		(xfs_qcnt_t) newlim->d_ino_hardlimit :
+		INT_GET(ddq->d_ino_hardlimit, ARCH_CONVERT);
+	soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
+		(xfs_qcnt_t) newlim->d_ino_softlimit :
+		INT_GET(ddq->d_ino_softlimit, ARCH_CONVERT);
+	if (hard == 0 || hard >= soft) {
+		INT_SET(ddq->d_ino_hardlimit, ARCH_CONVERT, hard);
+		INT_SET(ddq->d_ino_softlimit, ARCH_CONVERT, soft);
+	}
+	else
+		qdprintk("ihard %Ld < isoft %Ld\n", hard, soft);
+
+	if (id == 0) {
+		/*
+		 * Timelimits for the super user set the relative time
+		 * the other users can be over quota for this file system.
+		 * If it is zero a default is used.
+		 */
+		if (newlim->d_fieldmask & FS_DQ_BTIMER) {
+			mp->m_quotainfo->qi_btimelimit = newlim->d_btimer;
+			INT_SET(dqp->q_core.d_btimer, ARCH_CONVERT, newlim->d_btimer);
+		}
+		if (newlim->d_fieldmask & FS_DQ_ITIMER) {
+			mp->m_quotainfo->qi_itimelimit = newlim->d_itimer;
+			INT_SET(dqp->q_core.d_itimer, ARCH_CONVERT, newlim->d_itimer);
+		}
+		if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
+			mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer;
+			INT_SET(dqp->q_core.d_rtbtimer, ARCH_CONVERT, newlim->d_rtbtimer);
+		}
+	} else /* if (XFS_IS_QUOTA_ENFORCED(mp)) */ {
+		/*
+		 * If the user is now over quota, start the timelimit.
+		 * The user will not be 'warned'.
+		 * Note that we keep the timers ticking, whether enforcement
+		 * is on or off. We don't really want to bother with iterating
+		 * over all ondisk dquots and turning the timers on/off.
+		 */
+		xfs_qm_adjust_dqtimers(mp, ddq);
+	}
+	dqp->dq_flags |= XFS_DQ_DIRTY;
+	xfs_trans_log_dquot(tp, dqp);
+
+	xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
+	xfs_trans_commit(tp, 0, NULL);
+	xfs_qm_dqprint(dqp);
+	xfs_qm_dqrele(dqp);
+	mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
+
+	return (0);
+}
+
+STATIC int
+xfs_qm_scall_getquota(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	id,
+	uint		type,
+	fs_disk_quota_t *out)
+{
+	xfs_dquot_t	*dqp;
+	int		error;
+
+	/*
+	 * Try to get the dquot. We don't want it allocated on disk, so
+	 * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
+	 * exist, we'll get ENOENT back.
+	 */
+	if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
+		return (error);
+	}
+
+	xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS");
+	/*
+	 * If everything's NULL, this dquot doesn't quite exist as far as
+	 * our utility programs are concerned.
+	 */
+	if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+		xfs_qm_dqput(dqp);
+		return XFS_ERROR(ENOENT);
+	}
+	/* xfs_qm_dqprint(dqp); */
+	/*
+	 * Convert the disk dquot to the exportable format
+	 */
+	xfs_qm_export_dquot(mp, &dqp->q_core, out);
+	xfs_qm_dqput(dqp);
+	return (error ? XFS_ERROR(EFAULT) : 0);
+}
+
+
+STATIC int
+xfs_qm_log_quotaoff_end(
+	xfs_mount_t		*mp,
+	xfs_qoff_logitem_t	*startqoff,
+	uint			flags)
+{
+	xfs_trans_t	       *tp;
+	int			error;
+	xfs_qoff_logitem_t     *qoffi;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
+
+	if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
+				      0, 0, XFS_DEFAULT_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		return (error);
+	}
+
+	qoffi = xfs_trans_get_qoff_item(tp, startqoff,
+					flags & XFS_ALL_QUOTA_ACCT);
+	xfs_trans_log_quotaoff_item(tp, qoffi);
+
+	/*
+	 * We have to make sure that the transaction is secure on disk before we
+	 * return and actually stop quota accounting. So, make it synchronous.
+	 * We don't care about quotoff's performance.
+	 */
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0, NULL);
+	return (error);
+}
+
+
+STATIC int
+xfs_qm_log_quotaoff(
+	xfs_mount_t	       *mp,
+	xfs_qoff_logitem_t     **qoffstartp,
+	uint		       flags)
+{
+	xfs_trans_t	       *tp;
+	int			error;
+	unsigned long	s;
+	xfs_qoff_logitem_t     *qoffi=NULL;
+	uint			oldsbqflag=0;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
+	if ((error = xfs_trans_reserve(tp, 0,
+				      sizeof(xfs_qoff_logitem_t) * 2 +
+				      mp->m_sb.sb_sectsize + 128,
+				      0,
+				      0,
+				      XFS_DEFAULT_LOG_COUNT))) {
+		goto error0;
+	}
+
+	qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
+	xfs_trans_log_quotaoff_item(tp, qoffi);
+
+	s = XFS_SB_LOCK(mp);
+	oldsbqflag = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
+	XFS_SB_UNLOCK(mp, s);
+
+	xfs_mod_sb(tp, XFS_SB_QFLAGS);
+
+	/*
+	 * We have to make sure that the transaction is secure on disk before we
+	 * return and actually stop quota accounting. So, make it synchronous.
+	 * We don't care about quotoff's performance.
+	 */
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0, NULL);
+
+error0:
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		/*
+		 * No one else is modifying sb_qflags, so this is OK.
+		 * We still hold the quotaofflock.
+		 */
+		s = XFS_SB_LOCK(mp);
+		mp->m_sb.sb_qflags = oldsbqflag;
+		XFS_SB_UNLOCK(mp, s);
+	}
+	*qoffstartp = qoffi;
+	return (error);
+}
+
+
+/*
+ * Translate an internal style on-disk-dquot to the exportable format.
+ * The main differences are that the counters/limits are all in Basic
+ * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
+ * to be converted to the native endianness.
+ */
+STATIC void
+xfs_qm_export_dquot(
+	xfs_mount_t		*mp,
+	xfs_disk_dquot_t	*src,
+	struct fs_disk_quota	*dst)
+{
+	bzero(dst, sizeof(*dst));
+	dst->d_version = FS_DQUOT_VERSION;  /* different from src->d_version */
+	dst->d_flags =
+		xfs_qm_export_qtype_flags(INT_GET(src->d_flags, ARCH_CONVERT));
+	dst->d_id = INT_GET(src->d_id, ARCH_CONVERT);
+	dst->d_blk_hardlimit = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_hardlimit, ARCH_CONVERT));
+	dst->d_blk_softlimit = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_blk_softlimit, ARCH_CONVERT));
+	dst->d_ino_hardlimit = (__uint64_t)
+		INT_GET(src->d_ino_hardlimit, ARCH_CONVERT);
+	dst->d_ino_softlimit = (__uint64_t)
+		INT_GET(src->d_ino_softlimit, ARCH_CONVERT);
+	dst->d_bcount = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_bcount, ARCH_CONVERT));
+	dst->d_icount = (__uint64_t) INT_GET(src->d_icount, ARCH_CONVERT);
+	dst->d_btimer = (__uint32_t) INT_GET(src->d_btimer, ARCH_CONVERT);
+	dst->d_itimer = (__uint32_t) INT_GET(src->d_itimer, ARCH_CONVERT);
+	dst->d_iwarns = INT_GET(src->d_iwarns, ARCH_CONVERT);
+	dst->d_bwarns = INT_GET(src->d_bwarns, ARCH_CONVERT);
+
+	dst->d_rtb_hardlimit = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_hardlimit, ARCH_CONVERT));
+	dst->d_rtb_softlimit = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_rtb_softlimit, ARCH_CONVERT));
+	dst->d_rtbcount = (__uint64_t)
+		XFS_FSB_TO_BB(mp, INT_GET(src->d_rtbcount, ARCH_CONVERT));
+	dst->d_rtbtimer = (__uint32_t) INT_GET(src->d_rtbtimer, ARCH_CONVERT);
+	dst->d_rtbwarns = INT_GET(src->d_rtbwarns, ARCH_CONVERT);
+
+	/*
+	 * Internally, we don't reset all the timers when quota enforcement
+	 * gets turned off. No need to confuse the userlevel code,
+	 * so return zeroes in that case.
+	 */
+	if (! XFS_IS_QUOTA_ENFORCED(mp)) {
+		dst->d_btimer = 0;
+		dst->d_itimer = 0;
+		dst->d_rtbtimer = 0;
+	}
+
+#ifdef DEBUG
+	if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) {
+		if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
+		    (dst->d_blk_softlimit > 0)) {
+			ASSERT(dst->d_btimer != 0);
+		}
+		if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
+		    (dst->d_ino_softlimit > 0)) {
+			ASSERT(dst->d_itimer != 0);
+		}
+	}
+#endif
+}
+
+STATIC uint
+xfs_qm_import_qtype_flags(
+	uint uflags)
+{
+	/*
+	 * Can't be both at the same time.
+	 */
+	if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ==
+	     (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) ||
+	    ((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == 0))
+		return (0);
+
+	return (uflags & XFS_USER_QUOTA) ?
+		XFS_DQ_USER : XFS_DQ_GROUP;
+}
+
+STATIC uint
+xfs_qm_export_qtype_flags(
+	uint flags)
+{
+	/*
+	 * Can't be both at the same time.
+	 */
+	ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) !=
+		(XFS_GROUP_QUOTA | XFS_USER_QUOTA));
+	ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) != 0);
+
+	return (flags & XFS_DQ_USER) ?
+		XFS_USER_QUOTA : XFS_GROUP_QUOTA;
+}
+
+STATIC uint
+xfs_qm_import_flags(
+	uint uflags)
+{
+	uint flags = 0;
+
+	if (uflags & XFS_QUOTA_UDQ_ACCT)
+		flags |= XFS_UQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_GDQ_ACCT)
+		flags |= XFS_GQUOTA_ACCT;
+	if (uflags & XFS_QUOTA_UDQ_ENFD)
+		flags |= XFS_UQUOTA_ENFD;
+	if (uflags & XFS_QUOTA_GDQ_ENFD)
+		flags |= XFS_GQUOTA_ENFD;
+	return (flags);
+}
+
+
+STATIC uint
+xfs_qm_export_flags(
+	uint flags)
+{
+	uint uflags;
+
+	uflags = 0;
+	if (flags & XFS_UQUOTA_ACCT)
+		uflags |= XFS_QUOTA_UDQ_ACCT;
+	if (flags & XFS_GQUOTA_ACCT)
+		uflags |= XFS_QUOTA_GDQ_ACCT;
+	if (flags & XFS_UQUOTA_ENFD)
+		uflags |= XFS_QUOTA_UDQ_ENFD;
+	if (flags & XFS_GQUOTA_ENFD)
+		uflags |= XFS_QUOTA_GDQ_ENFD;
+	return (uflags);
+}
+
+
+/*
+ * Go thru all the inodes in the file system, releasing their dquots.
+ * Note that the mount structure gets modified to indicate that quotas are off
+ * AFTER this, in the case of quotaoff. This also gets called from
+ * xfs_rootumount.
+ */
+void
+xfs_qm_dqrele_all_inodes(
+	struct xfs_mount *mp,
+	uint		 flags)
+{
+	vmap_t		vmap;
+	xfs_inode_t	*ip, *topino;
+	uint		ireclaims;
+	vnode_t		*vp;
+	boolean_t	vnode_refd;
+
+	ASSERT(mp->m_quotainfo);
+
+again:
+	XFS_MOUNT_ILOCK(mp);
+	ip = mp->m_inodes;
+	if (ip == NULL) {
+		XFS_MOUNT_IUNLOCK(mp);
+		return;
+	}
+	do {
+		/* Skip markers inserted by xfs_sync */
+		if (ip->i_mount == NULL) {
+			ip = ip->i_mnext;
+			continue;
+		}
+		/* Root inode, rbmip and rsumip have associated blocks */
+		if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
+			ASSERT(ip->i_udquot == NULL);
+			ASSERT(ip->i_gdquot == NULL);
+			ip = ip->i_mnext;
+			continue;
+		}
+		vp = XFS_ITOV_NULL(ip);
+		if (!vp) {
+			ASSERT(ip->i_udquot == NULL);
+			ASSERT(ip->i_gdquot == NULL);
+			ip = ip->i_mnext;
+			continue;
+		}
+		vnode_refd = B_FALSE;
+		if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
+			/*
+			 * Sample vp mapping while holding the mplock, lest
+			 * we come across a non-existent vnode.
+			 */
+			VMAP(vp, ip, vmap);
+			ireclaims = mp->m_ireclaims;
+			topino = mp->m_inodes;
+			XFS_MOUNT_IUNLOCK(mp);
+
+			/* XXX restart limit ? */
+			if ( ! (vp = vn_get(vp, &vmap)))
+				goto again;
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			vnode_refd = B_TRUE;
+		} else {
+			ireclaims = mp->m_ireclaims;
+			topino = mp->m_inodes;
+			XFS_MOUNT_IUNLOCK(mp);
+		}
+
+		/*
+		 * We don't keep the mountlock across the dqrele() call,
+		 * since it can take a while..
+		 */
+		if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
+			xfs_qm_dqrele(ip->i_udquot);
+			ip->i_udquot = NULL;
+		}
+		if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+			xfs_qm_dqrele(ip->i_gdquot);
+			ip->i_gdquot = NULL;
+		}
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		/*
+		 * Wait until we've dropped the ilock and mountlock to
+		 * do the vn_rele. Or be condemned to an eternity in the
+		 * inactive code in hell.
+		 */
+		if (vnode_refd)
+			VN_RELE(vp);
+		XFS_MOUNT_ILOCK(mp);
+		/*
+		 * If an inode was inserted or removed, we gotta
+		 * start over again.
+		 */
+		if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
+			/* XXX use a sentinel */
+			XFS_MOUNT_IUNLOCK(mp);
+			goto again;
+		}
+		ip = ip->i_mnext;
+	} while (ip != mp->m_inodes);
+
+	XFS_MOUNT_IUNLOCK(mp);
+}
+
+/*------------------------------------------------------------------------*/
+#ifdef DEBUG
+/*
+ * This contains all the test functions for XFS disk quotas.
+ * Currently it does a quota accounting check. ie. it walks through
+ * all inodes in the file system, calculating the dquot accounting fields,
+ * and prints out any inconsistencies.
+ */
+xfs_dqhash_t *qmtest_udqtab;
+xfs_dqhash_t *qmtest_gdqtab;
+int	      qmtest_hashmask;
+int	      qmtest_nfails;
+mutex_t	      qcheck_lock;
+
+#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
+				 (__psunsigned_t)(id)) & \
+				(qmtest_hashmask - 1))
+
+#define DQTEST_HASH(mp, id, type)   ((type & XFS_DQ_USER) ? \
+				     (qmtest_udqtab + \
+				      DQTEST_HASHVAL(mp, id)) : \
+				     (qmtest_gdqtab + \
+				      DQTEST_HASHVAL(mp, id)))
+
+#define DQTEST_LIST_PRINT(l, NXT, title) \
+{ \
+	  xfs_dqtest_t	*dqp; int i = 0;\
+	  printk("%s (#%d)\n", title, (int) (l)->qh_nelems); \
+	  for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
+	       dqp = (xfs_dqtest_t *)dqp->NXT) { \
+	    printk("\t%d\.\t\"%d (%s)\"\t bcnt = %d, icnt = %d\n", \
+			 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp),	     \
+			 dqp->d_bcount, dqp->d_icount); } \
+}
+
+typedef struct dqtest {
+	xfs_dqmarker_t	q_lists;
+	xfs_dqhash_t	*q_hash;	/* the hashchain header */
+	xfs_mount_t	*q_mount;	/* filesystem this relates to */
+	xfs_dqid_t	d_id;		/* user id or group id */
+	xfs_qcnt_t	d_bcount;	/* # disk blocks owned by the user */
+	xfs_qcnt_t	d_icount;	/* # inodes owned by the user */
+} xfs_dqtest_t;
+
+STATIC void
+xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
+{
+	xfs_dquot_t *d;
+	if (((d) = (h)->qh_next))
+		(d)->HL_PREVP = &((dqp)->HL_NEXT);
+	(dqp)->HL_NEXT = d;
+	(dqp)->HL_PREVP = &((h)->qh_next);
+	(h)->qh_next = (xfs_dquot_t *)dqp;
+	(h)->qh_version++;
+	(h)->qh_nelems++;
+}
+STATIC void
+xfs_qm_dqtest_print(
+	xfs_dqtest_t	*d)
+{
+	printk("-----------DQTEST DQUOT----------------\n");
+	printk("---- dquot ID =	 %d\n", d->d_id);
+	printk("---- type     =	 %s\n", XFS_QM_ISUDQ(d) ? "USR" : "GRP");
+	printk("---- fs	      =	 0x%p\n", d->q_mount);
+	printk("---- bcount   =	 %Lu (0x%x)\n", d->d_bcount, (int)d->d_bcount);
+	printk("---- icount   =	 %Lu (0x%x)\n", d->d_icount, (int)d->d_icount);
+	printk("---------------------------\n");
+}
+
+STATIC void
+xfs_qm_dqtest_failed(
+	xfs_dqtest_t	*d,
+	xfs_dquot_t	*dqp,
+	char		*reason,
+	xfs_qcnt_t	a,
+	xfs_qcnt_t	b,
+	int		error)
+{
+	qmtest_nfails++;
+	if (error)
+		printk("quotacheck failed for %d, error = %d\nreason = %s\n",
+		       INT_GET(d->d_id, ARCH_CONVERT), error, reason);
+	else
+		printk("quotacheck failed for %d (%s) [%d != %d]\n",
+		       INT_GET(d->d_id, ARCH_CONVERT), reason, (int)a, (int)b);
+	xfs_qm_dqtest_print(d);
+	if (dqp)
+		xfs_qm_dqprint(dqp);
+}
+
+STATIC int
+xfs_dqtest_cmp2(
+	xfs_dqtest_t	*d,
+	xfs_dquot_t	*dqp)
+{
+	int err = 0;
+	if (INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) != d->d_icount) {
+		xfs_qm_dqtest_failed(d, dqp, "icount mismatch",
+				     INT_GET(dqp->q_core.d_icount, ARCH_CONVERT), d->d_icount, 0);
+		err++;
+	}
+	if (INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) != d->d_bcount) {
+		xfs_qm_dqtest_failed(d, dqp, "bcount mismatch",
+				     INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT), d->d_bcount, 0);
+		err++;
+	}
+	if (INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT) &&
+	    INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT) >= INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT)) {
+		if (INT_ISZERO(dqp->q_core.d_btimer, ARCH_CONVERT) &&
+		    !INT_ISZERO(dqp->q_core.d_id, ARCH_CONVERT)) {
+			printk("%d [%s] [0x%p] BLK TIMER NOT STARTED\n",
+			       d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
+			err++;
+		}
+	}
+	if (INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT) &&
+	    INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT)) {
+		if (INT_ISZERO(dqp->q_core.d_itimer, ARCH_CONVERT) &&
+		    !INT_ISZERO(dqp->q_core.d_id, ARCH_CONVERT)) {
+			printk("%d [%s] [0x%p] INO TIMER NOT STARTED\n",
+			       d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount);
+			err++;
+		}
+	}
+#if 0
+	if (!err) {
+		printk("%d [%s] [0x%p] qchecked\n",
+		       d->d_id, XFS_QM_ISUDQ(d) ? "USR" : "GRP", d->q_mount);
+	}
+#endif
+	return (err);
+}
+
+STATIC void
+xfs_dqtest_cmp(
+	xfs_dqtest_t	*d)
+{
+	xfs_dquot_t	*dqp;
+	int		error;
+
+	/* xfs_qm_dqtest_print(d); */
+	if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0,
+				 &dqp))) {
+		xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error);
+		return;
+	}
+	xfs_dqtest_cmp2(d, dqp);
+	xfs_qm_dqput(dqp);
+}
+
+STATIC int
+xfs_qm_internalqcheck_dqget(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	id,
+	uint		type,
+	xfs_dqtest_t	**O_dq)
+{
+	xfs_dqtest_t	*d;
+	xfs_dqhash_t	*h;
+
+	h = DQTEST_HASH(mp, id, type);
+	for (d = (xfs_dqtest_t *) h->qh_next; d != NULL;
+	     d = (xfs_dqtest_t *) d->HL_NEXT) {
+		/* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */
+		if (d->d_id == id && mp == d->q_mount) {
+			*O_dq = d;
+			return (0);
+		}
+	}
+	d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP);
+	d->dq_flags = type;
+	d->d_id = id;
+	d->q_mount = mp;
+	d->q_hash = h;
+	xfs_qm_hashinsert(h, d);
+	*O_dq = d;
+	return (0);
+}
+
+STATIC void
+xfs_qm_internalqcheck_get_dquots(
+	xfs_mount_t	*mp,
+	xfs_dqid_t	uid,
+	xfs_dqid_t	gid,
+	xfs_dqtest_t	**ud,
+	xfs_dqtest_t	**gd)
+{
+	if (XFS_IS_UQUOTA_ON(mp))
+		xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud);
+	if (XFS_IS_GQUOTA_ON(mp))
+		xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd);
+}
+
+
+STATIC void
+xfs_qm_internalqcheck_dqadjust(
+	xfs_inode_t		*ip,
+	xfs_dqtest_t		*d)
+{
+	d->d_icount++;
+	d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks;
+}
+
+STATIC int
+xfs_qm_internalqcheck_adjust(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		*buffer,	/* not used */
+	xfs_daddr_t	bno,		/* starting block of inode cluster */
+	void		*dip,		/* not used */
+	int		*res)		/* bulkstat result code */
+{
+	xfs_inode_t		*ip;
+	xfs_dqtest_t		*ud, *gd;
+	uint			lock_flags;
+	boolean_t		ipreleased;
+	int			error;
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+		*res = BULKSTAT_RV_NOTHING;
+		qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n",
+			(unsigned long long) ino,
+			(unsigned long long) mp->m_sb.sb_uquotino,
+			(unsigned long long) mp->m_sb.sb_gquotino);
+		return XFS_ERROR(EINVAL);
+	}
+	ipreleased = B_FALSE;
+ again:
+	lock_flags = XFS_ILOCK_SHARED;
+	if ((error = xfs_iget(mp, tp, ino, lock_flags, &ip, bno))) {
+		*res = BULKSTAT_RV_NOTHING;
+		return (error);
+	}
+
+	if (ip->i_d.di_mode == 0) {
+		xfs_iput_new(ip, lock_flags);
+		*res = BULKSTAT_RV_NOTHING;
+		return XFS_ERROR(ENOENT);
+	}
+
+	/*
+	 * This inode can have blocks after eof which can get released
+	 * when we send it to inactive. Since we don't check the dquot
+	 * until the after all our calculations are done, we must get rid
+	 * of those now.
+	 */
+	if (! ipreleased) {
+		xfs_iput(ip, lock_flags);
+		ipreleased = B_TRUE;
+		goto again;
+	}
+	xfs_qm_internalqcheck_get_dquots(mp,
+					(xfs_dqid_t) ip->i_d.di_uid,
+					(xfs_dqid_t) ip->i_d.di_gid,
+					&ud, &gd);
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		ASSERT(ud);
+		xfs_qm_internalqcheck_dqadjust(ip, ud);
+	}
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		ASSERT(gd);
+		xfs_qm_internalqcheck_dqadjust(ip, gd);
+	}
+	xfs_iput(ip, lock_flags);
+	*res = BULKSTAT_RV_DIDONE;
+	return (0);
+}
+
+
+/* PRIVATE, debugging */
+int
+xfs_qm_internalqcheck(
+	xfs_mount_t	*mp)
+{
+	xfs_ino_t	lastino;
+	int		done, count;
+	int		i;
+	xfs_dqtest_t	*d, *e;
+	xfs_dqhash_t	*h1;
+	int		error;
+
+	lastino = 0;
+	qmtest_hashmask = 32;
+	count = 5;
+	done = 0;
+	qmtest_nfails = 0;
+
+	if (! XFS_IS_QUOTA_ON(mp))
+		return XFS_ERROR(ESRCH);
+
+	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	XFS_bflush(mp->m_ddev_targp);
+	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	XFS_bflush(mp->m_ddev_targp);
+
+	mutex_lock(&qcheck_lock, PINOD);
+	/* There should be absolutely no quota activity while this
+	   is going on. */
+	qmtest_udqtab = kmem_zalloc(qmtest_hashmask *
+				    sizeof(xfs_dqhash_t), KM_SLEEP);
+	qmtest_gdqtab = kmem_zalloc(qmtest_hashmask *
+				    sizeof(xfs_dqhash_t), KM_SLEEP);
+	do {
+		/*
+		 * Iterate thru all the inodes in the file system,
+		 * adjusting the corresponding dquot counters
+		 */
+		if ((error = xfs_bulkstat(mp, NULL, &lastino, &count,
+				 xfs_qm_internalqcheck_adjust,
+				 0, NULL, BULKSTAT_FG_IGET, &done))) {
+			break;
+		}
+	} while (! done);
+	if (error) {
+		printk("Bulkstat returned error 0x%x\n",
+		       error);
+	}
+	printk("Checking results against system dquots\n");
+	for (i = 0; i < qmtest_hashmask; i++) {
+		h1 = &qmtest_udqtab[i];
+		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
+			xfs_dqtest_cmp(d);
+			e = (xfs_dqtest_t *) d->HL_NEXT;
+			kmem_free(d, sizeof(xfs_dqtest_t));
+			d = e;
+		}
+		h1 = &qmtest_gdqtab[i];
+		for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
+			xfs_dqtest_cmp(d);
+			e = (xfs_dqtest_t *) d->HL_NEXT;
+			kmem_free(d, sizeof(xfs_dqtest_t));
+			d = e;
+		}
+	}
+
+	if (qmtest_nfails) {
+		printk("**************	quotacheck failed  **************\n");
+		printk("failures = %d\n", qmtest_nfails);
+	} else {
+		printk("**************	quotacheck successful! **************\n");
+	}
+	kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
+	kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
+	mutex_unlock(&qcheck_lock);
+	return (qmtest_nfails);
+}
+
+#endif /* DEBUG */
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
new file mode 100644
index 000000000000..37639212e07d
--- /dev/null
+++ b/fs/xfs/xfs_quota.h
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_QUOTA_H__
+#define __XFS_QUOTA_H__
+
+/*
+ * uid_t and gid_t are hard-coded to 32 bits in the inode.
+ * Hence, an 'id' in a dquot is 32 bits..
+ */
+typedef __int32_t	xfs_dqid_t;
+
+/*
+ * Eventhough users may not have quota limits occupying all 64-bits,
+ * they may need 64-bit accounting. Hence, 64-bit quota-counters,
+ * and quota-limits. This is a waste in the common case, but hey ...
+ */
+typedef __uint64_t	xfs_qcnt_t;
+typedef __uint16_t	xfs_qwarncnt_t;
+
+/*
+ * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
+ */
+#define XFS_UQUOTA_ACCT 0x0001	/* user quota accounting ON */
+#define XFS_UQUOTA_ENFD 0x0002	/* user quota limits enforced */
+#define XFS_UQUOTA_CHKD 0x0004	/* quotacheck run on usr quotas */
+#define XFS_PQUOTA_ACCT 0x0008	/* (IRIX) project quota accounting ON */
+#define XFS_GQUOTA_ENFD 0x0010	/* group quota limits enforced */
+#define XFS_GQUOTA_CHKD 0x0020	/* quotacheck run on grp quotas */
+#define XFS_GQUOTA_ACCT 0x0040	/* group quota accounting ON */
+
+/*
+ * Incore only flags for quotaoff - these bits get cleared when quota(s)
+ * are in the process of getting turned off. These flags are in m_qflags but
+ * never in sb_qflags.
+ */
+#define XFS_UQUOTA_ACTIVE	0x0080	/* uquotas are being turned off */
+#define XFS_GQUOTA_ACTIVE	0x0100	/* gquotas are being turned off */
+
+/*
+ * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
+ * quota will be not be switched off as long as that inode lock is held.
+ */
+#define XFS_IS_QUOTA_ON(mp)	((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
+						   XFS_GQUOTA_ACTIVE))
+#define XFS_IS_UQUOTA_ON(mp)	((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
+#define XFS_IS_GQUOTA_ON(mp)	((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
+
+/*
+ * Flags to tell various functions what to do. Not all of these are meaningful
+ * to a single function. None of these XFS_QMOPT_* flags are meant to have
+ * persistent values (ie. their values can and will change between versions)
+ */
+#define XFS_QMOPT_DQLOCK	0x0000001 /* dqlock */
+#define XFS_QMOPT_DQALLOC	0x0000002 /* alloc dquot ondisk if needed */
+#define XFS_QMOPT_UQUOTA	0x0000004 /* user dquot requested */
+#define XFS_QMOPT_GQUOTA	0x0000008 /* group dquot requested */
+#define XFS_QMOPT_FORCE_RES	0x0000010 /* ignore quota limits */
+#define XFS_QMOPT_DQSUSER	0x0000020 /* don't cache super users dquot */
+#define XFS_QMOPT_SBVERSION	0x0000040 /* change superblock version num */
+#define XFS_QMOPT_QUOTAOFF	0x0000080 /* quotas are being turned off */
+#define XFS_QMOPT_UMOUNTING	0x0000100 /* filesys is being unmounted */
+#define XFS_QMOPT_DOLOG		0x0000200 /* log buf changes (in quotacheck) */
+#define XFS_QMOPT_DOWARN	0x0000400 /* increase warning cnt if necessary */
+#define XFS_QMOPT_ILOCKED	0x0000800 /* inode is already locked (excl) */
+#define XFS_QMOPT_DQREPAIR	0x0001000 /* repair dquot, if damaged. */
+
+/*
+ * flags to xfs_trans_mod_dquot to indicate which field needs to be
+ * modified.
+ */
+#define XFS_QMOPT_RES_REGBLKS	0x0010000
+#define XFS_QMOPT_RES_RTBLKS	0x0020000
+#define XFS_QMOPT_BCOUNT	0x0040000
+#define XFS_QMOPT_ICOUNT	0x0080000
+#define XFS_QMOPT_RTBCOUNT	0x0100000
+#define XFS_QMOPT_DELBCOUNT	0x0200000
+#define XFS_QMOPT_DELRTBCOUNT	0x0400000
+#define XFS_QMOPT_RES_INOS	0x0800000
+
+/*
+ * flags for dqflush and dqflush_all.
+ */
+#define XFS_QMOPT_SYNC		0x1000000
+#define XFS_QMOPT_ASYNC		0x2000000
+#define XFS_QMOPT_DELWRI	0x4000000
+
+/*
+ * flags for dqalloc.
+ */
+#define XFS_QMOPT_INHERIT	0x8000000
+
+/*
+ * flags to xfs_trans_mod_dquot.
+ */
+#define XFS_TRANS_DQ_RES_BLKS	XFS_QMOPT_RES_REGBLKS
+#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
+#define XFS_TRANS_DQ_RES_INOS	XFS_QMOPT_RES_INOS
+#define XFS_TRANS_DQ_BCOUNT	XFS_QMOPT_BCOUNT
+#define XFS_TRANS_DQ_DELBCOUNT	XFS_QMOPT_DELBCOUNT
+#define XFS_TRANS_DQ_ICOUNT	XFS_QMOPT_ICOUNT
+#define XFS_TRANS_DQ_RTBCOUNT	XFS_QMOPT_RTBCOUNT
+#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
+
+
+#define XFS_QMOPT_QUOTALL	(XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA)
+#define XFS_QMOPT_RESBLK_MASK	(XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
+/*
+ * This check is done typically without holding the inode lock;
+ * that may seem racey, but it is harmless in the context that it is used.
+ * The inode cannot go inactive as long a reference is kept, and
+ * therefore if dquot(s) were attached, they'll stay consistent.
+ * If, for example, the ownership of the inode changes while
+ * we didnt have the inode locked, the appropriate dquot(s) will be
+ * attached atomically.
+ */
+#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\
+				     (ip)->i_udquot == NULL) || \
+				    (XFS_IS_GQUOTA_ON(mp) && \
+				     (ip)->i_gdquot == NULL))
+
+#define XFS_QM_NEED_QUOTACHECK(mp) ((XFS_IS_UQUOTA_ON(mp) && \
+				     (mp->m_sb.sb_qflags & \
+				      XFS_UQUOTA_CHKD) == 0) || \
+				    (XFS_IS_GQUOTA_ON(mp) && \
+				     (mp->m_sb.sb_qflags & \
+				      XFS_GQUOTA_CHKD) == 0))
+
+#define XFS_MOUNT_QUOTA_ALL	(XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
+				 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
+				 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD)
+#define XFS_MOUNT_QUOTA_MASK	(XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \
+				 XFS_GQUOTA_ACTIVE)
+
+#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_XFS_QUOTA
+/*
+ * External Interface to the XFS disk quota subsystem.
+ */
+struct	xfs_disk_dquot;
+struct	xfs_dqhash;
+struct	xfs_dquot;
+struct	xfs_inode;
+struct	xfs_mount;
+struct	xfs_trans;
+
+/*
+ * Quota Manager Interface.
+ */
+extern struct xfs_qm   *xfs_qm_init(void);
+extern void		xfs_qm_destroy(struct xfs_qm *);
+extern int		xfs_qm_dqflush_all(struct xfs_mount *, int);
+extern int		xfs_qm_dqattach(struct xfs_inode *, uint);
+extern int		xfs_qm_dqpurge_all(struct xfs_mount *, uint);
+extern void		xfs_qm_mount_quotainit(struct xfs_mount *, uint);
+extern void		xfs_qm_unmount_quotadestroy(struct xfs_mount *);
+extern int		xfs_qm_mount_quotas(struct xfs_mount *);
+extern int		xfs_qm_unmount_quotas(struct xfs_mount *);
+extern void		xfs_qm_dqdettach_inode(struct xfs_inode *);
+extern int		xfs_qm_sync(struct xfs_mount *, short);
+
+/*
+ * Dquot interface.
+ */
+extern void		xfs_dqlock(struct xfs_dquot *);
+extern void		xfs_dqunlock(struct xfs_dquot *);
+extern void		xfs_dqunlock_nonotify(struct xfs_dquot *);
+extern void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+extern void		xfs_qm_dqput(struct xfs_dquot *);
+extern void		xfs_qm_dqrele(struct xfs_dquot *);
+extern xfs_dqid_t	xfs_qm_dqid(struct xfs_dquot *);
+extern int		xfs_qm_dqget(struct xfs_mount *,
+				     struct xfs_inode *, xfs_dqid_t,
+				      uint, uint, struct xfs_dquot **);
+extern int		xfs_qm_dqcheck(struct xfs_disk_dquot *,
+				       xfs_dqid_t, uint, uint, char *);
+
+/*
+ * Vnodeops specific code that should actually be _in_ xfs_vnodeops.c, but
+ * is here because it's nicer to keep vnodeops (therefore, XFS) lean
+ * and clean.
+ */
+extern struct xfs_dquot *	xfs_qm_vop_chown(struct xfs_trans *,
+						 struct xfs_inode *,
+						 struct xfs_dquot **,
+						 struct xfs_dquot *);
+extern int		xfs_qm_vop_dqalloc(struct xfs_mount *,
+					   struct xfs_inode *,
+					   uid_t, gid_t, uint,
+					   struct xfs_dquot	**,
+					   struct xfs_dquot	**);
+
+extern int		xfs_qm_vop_chown_reserve(struct xfs_trans *,
+						 struct xfs_inode *,
+						 struct xfs_dquot *,
+						 struct xfs_dquot *,
+						 uint);
+
+extern int		xfs_qm_vop_rename_dqattach(struct xfs_inode **);
+extern void		xfs_qm_vop_dqattach_and_dqmod_newinode(
+						struct xfs_trans *,
+						struct xfs_inode *,
+						struct xfs_dquot *,
+						struct xfs_dquot *);
+
+
+/*
+ * Dquot Transaction interface
+ */
+extern void		xfs_trans_alloc_dqinfo(struct xfs_trans *);
+extern void		xfs_trans_free_dqinfo(struct xfs_trans *);
+extern void		xfs_trans_dup_dqinfo(struct xfs_trans *,
+					     struct xfs_trans *);
+extern void		xfs_trans_mod_dquot(struct xfs_trans *,
+					    struct xfs_dquot *,
+					    uint, long);
+extern void		xfs_trans_mod_dquot_byino(struct xfs_trans *,
+						  struct xfs_inode *,
+						  uint, long);
+extern void		xfs_trans_apply_dquot_deltas(struct xfs_trans *);
+extern void		xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
+
+extern int		xfs_trans_reserve_quota_nblks(struct xfs_trans *,
+						      struct xfs_inode *,
+						      long, long, uint);
+
+
+extern int		xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
+							 struct xfs_dquot *,
+							 struct xfs_dquot *,
+							 long, long, uint);
+extern void		xfs_trans_log_dquot(struct xfs_trans *,
+					    struct xfs_dquot *);
+extern void		xfs_trans_dqjoin(struct xfs_trans *,
+					 struct xfs_dquot *);
+extern void		xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
+
+# define _XQM_ZONE_DESTROY(z)	((z)? kmem_cache_destroy(z) : (void)0)
+
+#else
+# define xfs_qm_init()					(NULL)
+# define xfs_qm_destroy(xqm)				do { } while (0)
+# define xfs_qm_dqflush_all(m,t)			(ENOSYS)
+# define xfs_qm_dqattach(i,t)				(ENOSYS)
+# define xfs_qm_dqpurge_all(m,t)			(ENOSYS)
+# define xfs_qm_mount_quotainit(m,t)			do { } while (0)
+# define xfs_qm_unmount_quotadestroy(m)			do { } while (0)
+# define xfs_qm_mount_quotas(m)				(ENOSYS)
+# define xfs_qm_unmount_quotas(m)			(ENOSYS)
+# define xfs_qm_dqdettach_inode(i)			do { } while (0)
+# define xfs_qm_sync(m,t)				(ENOSYS)
+# define xfs_dqlock(d)					do { } while (0)
+# define xfs_dqunlock(d)				do { } while (0)
+# define xfs_dqunlock_nonotify(d)			do { } while (0)
+# define xfs_dqlock2(d1,d2)				do { } while (0)
+# define xfs_qm_dqput(d)				do { } while (0)
+# define xfs_qm_dqrele(d)				do { } while (0)
+# define xfs_qm_dqid(d)					(-1)
+# define xfs_qm_dqget(m,i,di,t,f,d)			(ENOSYS)
+# define xfs_qm_dqcheck(dd,di,t,f,s)			(ENOSYS)
+# define xfs_trans_alloc_dqinfo(t)			do { } while (0)
+# define xfs_trans_free_dqinfo(t)			do { } while (0)
+# define xfs_trans_dup_dqinfo(t1,t2)			do { } while (0)
+# define xfs_trans_mod_dquot(t,d,f,x)			do { } while (0)
+# define xfs_trans_mod_dquot_byino(t,i,f,x)		do { } while (0)
+# define xfs_trans_apply_dquot_deltas(t)		do { } while (0)
+# define xfs_trans_unreserve_and_mod_dquots(t)		do { } while (0)
+# define xfs_trans_reserve_quota_nblks(t,i,nb,ni,f)	(ENOSYS)
+# define xfs_trans_reserve_quota_bydquots(t,x,y,b,i,f)	(ENOSYS)
+# define xfs_trans_log_dquot(t,d)			do { } while (0)
+# define xfs_trans_dqjoin(t,d)				do { } while (0)
+# define xfs_qm_dqrele_all_inodes(m,t)			do { } while (0)
+# define xfs_qm_vop_chown(t,i,d1,d2)			(NULL)
+# define xfs_qm_vop_dqalloc(m,i,u,g,f,d1,d2)		(ENOSYS)
+# define xfs_qm_vop_chown_reserve(t,i,d1,d2,f)		(ENOSYS)
+# define xfs_qm_vop_rename_dqattach(i)			(ENOSYS)
+# define xfs_qm_vop_dqattach_and_dqmod_newinode(t,i,x,y) do { } while (0)
+# define _XQM_ZONE_DESTROY(z)				do { } while (0)
+#endif	/* CONFIG_XFS_QUOTA */
+
+/*
+ * Regular disk block quota reservations
+ */
+#define		xfs_trans_reserve_blkquota(tp, ip, nblks) \
+xfs_trans_reserve_quota_nblks(tp, ip, nblks, 0, XFS_QMOPT_RES_REGBLKS)
+
+#define		xfs_trans_reserve_blkquota_force(tp, ip, nblks) \
+xfs_trans_reserve_quota_nblks(tp, ip, nblks, 0, \
+		XFS_QMOPT_RES_REGBLKS|XFS_QMOPT_FORCE_RES)
+
+#define		xfs_trans_unreserve_blkquota(tp, ip, nblks) \
+(void)xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), 0, XFS_QMOPT_RES_REGBLKS)
+
+#define		xfs_trans_reserve_quota(tp, udq, gdq, nb, ni, f) \
+xfs_trans_reserve_quota_bydquots(tp, udq, gdq, nb, ni, f|XFS_QMOPT_RES_REGBLKS)
+
+#define		xfs_trans_unreserve_quota(tp, ud, gd, b, i, f) \
+xfs_trans_reserve_quota_bydquots(tp, ud, gd, -(b), -(i), f|XFS_QMOPT_RES_REGBLKS)
+
+/*
+ * Realtime disk block quota reservations
+ */
+#define		xfs_trans_reserve_rtblkquota(mp, tp, ip, nblks) \
+xfs_trans_reserve_quota_nblks(tp, ip, nblks, 0, XFS_QMOPT_RES_RTBLKS)
+
+#define		xfs_trans_unreserve_rtblkquota(tp, ip, nblks) \
+(void)xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), 0, XFS_QMOPT_RES_RTBLKS)
+
+#define		xfs_trans_reserve_rtquota(mp, tp, uq, pq, blks, f) \
+xfs_trans_reserve_quota_bydquots(mp, tp, uq, pq, blks, 0, f|XFS_QMOPT_RES_RTBLKS)
+
+#define		xfs_trans_unreserve_rtquota(tp, uq, pq, blks) \
+xfs_trans_reserve_quota_bydquots(tp, uq, pq, -(blks), XFS_QMOPT_RES_RTBLKS)
+
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
new file mode 100644
index 000000000000..c7f8bb60e561
--- /dev/null
+++ b/fs/xfs/xfs_quota_priv.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_QUOTA_PRIV_H__
+#define __XFS_QUOTA_PRIV_H__
+
+/*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE	10
+
+/* Number of dquots that fit in to a dquot block */
+#define XFS_QM_DQPERBLK(mp)	((mp)->m_quotainfo->qi_dqperchunk)
+
+#define XFS_ISLOCKED_INODE(ip)		(ismrlocked(&(ip)->i_lock, \
+					    MR_UPDATE | MR_ACCESS) != 0)
+#define XFS_ISLOCKED_INODE_EXCL(ip)	(ismrlocked(&(ip)->i_lock, \
+					    MR_UPDATE) != 0)
+
+#define XFS_DQ_IS_ADDEDTO_TRX(t, d)	((d)->q_transp == (t))
+
+#define XFS_QI_MPLRECLAIMS(mp)	((mp)->m_quotainfo->qi_dqreclaims)
+#define XFS_QI_UQIP(mp)		((mp)->m_quotainfo->qi_uquotaip)
+#define XFS_QI_GQIP(mp)		((mp)->m_quotainfo->qi_gquotaip)
+#define XFS_QI_DQCHUNKLEN(mp)	((mp)->m_quotainfo->qi_dqchunklen)
+#define XFS_QI_BTIMELIMIT(mp)	((mp)->m_quotainfo->qi_btimelimit)
+#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit)
+#define XFS_QI_ITIMELIMIT(mp)	((mp)->m_quotainfo->qi_itimelimit)
+#define XFS_QI_BWARNLIMIT(mp)	((mp)->m_quotainfo->qi_bwarnlimit)
+#define XFS_QI_IWARNLIMIT(mp)	((mp)->m_quotainfo->qi_iwarnlimit)
+#define XFS_QI_QOFFLOCK(mp)	((mp)->m_quotainfo->qi_quotaofflock)
+
+#define XFS_QI_MPL_LIST(mp)	((mp)->m_quotainfo->qi_dqlist)
+#define XFS_QI_MPLLOCK(mp)	((mp)->m_quotainfo->qi_dqlist.qh_lock)
+#define XFS_QI_MPLNEXT(mp)	((mp)->m_quotainfo->qi_dqlist.qh_next)
+#define XFS_QI_MPLNDQUOTS(mp)	((mp)->m_quotainfo->qi_dqlist.qh_nelems)
+
+#define XQMLCK(h)			(mutex_lock(&((h)->qh_lock), PINOD))
+#define XQMUNLCK(h)			(mutex_unlock(&((h)->qh_lock)))
+#ifdef DEBUG
+static inline int
+XQMISLCKD(xfs_dqhash_t *h)
+{
+	if (mutex_trylock(&h->qh_lock)) {
+		mutex_unlock(&h->qh_lock);
+		return 0;
+	}
+	return 1;
+}
+#endif
+
+#define XFS_DQ_HASH_LOCK(h)		XQMLCK(h)
+#define XFS_DQ_HASH_UNLOCK(h)		XQMUNLCK(h)
+#define XFS_DQ_IS_HASH_LOCKED(h)	XQMISLCKD(h)
+
+#define xfs_qm_mplist_lock(mp)		XQMLCK(&(XFS_QI_MPL_LIST(mp)))
+#define xfs_qm_mplist_unlock(mp)	XQMUNLCK(&(XFS_QI_MPL_LIST(mp)))
+#define XFS_QM_IS_MPLIST_LOCKED(mp)	XQMISLCKD(&(XFS_QI_MPL_LIST(mp)))
+
+#define xfs_qm_freelist_lock(qm)	XQMLCK(&((qm)->qm_dqfreelist))
+#define xfs_qm_freelist_unlock(qm)	XQMUNLCK(&((qm)->qm_dqfreelist))
+#define XFS_QM_IS_FREELIST_LOCKED(qm)	XQMISLCKD(&((qm)->qm_dqfreelist))
+
+/*
+ * Hash into a bucket in the dquot hash table, based on <mp, id>.
+ */
+#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
+				 (__psunsigned_t)(id)) & \
+				(xfs_Gqm->qm_dqhashmask - 1))
+#define XFS_DQ_HASH(mp, id, type)   (type == XFS_DQ_USER ? \
+				     (xfs_Gqm->qm_usr_dqhtable + \
+				      XFS_DQ_HASHVAL(mp, id)) : \
+				     (xfs_Gqm->qm_grp_dqhtable + \
+				      XFS_DQ_HASHVAL(mp, id)))
+#define XFS_IS_DQTYPE_ON(mp, type)   (type == XFS_DQ_USER ? \
+				      XFS_IS_UQUOTA_ON(mp):XFS_IS_GQUOTA_ON(mp))
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+	INT_ISZERO(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_blk_softlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_rtb_hardlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_rtb_softlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_ino_softlimit, ARCH_CONVERT) && \
+	INT_ISZERO(dqp->q_core.d_bcount, ARCH_CONVERT)	      && \
+	INT_ISZERO(dqp->q_core.d_rtbcount, ARCH_CONVERT)      && \
+	INT_ISZERO(dqp->q_core.d_icount, ARCH_CONVERT))
+
+#define HL_PREVP	dq_hashlist.ql_prevp
+#define HL_NEXT		dq_hashlist.ql_next
+#define MPL_PREVP	dq_mplist.ql_prevp
+#define MPL_NEXT	dq_mplist.ql_next
+
+
+#define _LIST_REMOVE(h, dqp, PVP, NXT)				\
+	{							\
+		 xfs_dquot_t *d;				\
+		 if (((d) = (dqp)->NXT))				\
+			 (d)->PVP = (dqp)->PVP;			\
+		 *((dqp)->PVP) = d;				\
+		 (dqp)->NXT = NULL;				\
+		 (dqp)->PVP = NULL;				\
+		 (h)->qh_version++;				\
+		 (h)->qh_nelems--;				\
+	}
+
+#define _LIST_INSERT(h, dqp, PVP, NXT)				\
+	{							\
+		 xfs_dquot_t *d;				\
+		 if (((d) = (h)->qh_next))			\
+			 (d)->PVP = &((dqp)->NXT);		\
+		 (dqp)->NXT = d;				\
+		 (dqp)->PVP = &((h)->qh_next);			\
+		 (h)->qh_next = dqp;				\
+		 (h)->qh_version++;				\
+		 (h)->qh_nelems++;				\
+	 }
+
+#define FOREACH_DQUOT_IN_MP(dqp, mp) \
+	for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT)
+
+#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist)	\
+for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
+     (dqp) = (dqp)->dq_flnext)
+
+#define XQM_HASHLIST_INSERT(h, dqp)	\
+	 _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT)
+
+#define XQM_FREELIST_INSERT(h, dqp)	\
+	 xfs_qm_freelist_append(h, dqp)
+
+#define XQM_MPLIST_INSERT(h, dqp)	\
+	 _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT)
+
+#define XQM_HASHLIST_REMOVE(h, dqp)	\
+	 _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT)
+#define XQM_FREELIST_REMOVE(dqp)	\
+	 xfs_qm_freelist_unlink(dqp)
+#define XQM_MPLIST_REMOVE(h, dqp)	\
+	{ _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \
+	  XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; }
+
+#define XFS_DQ_IS_LOGITEM_INITD(dqp)	((dqp)->q_logitem.qli_dquot == (dqp))
+
+#define XFS_QM_DQP_TO_DQACCT(tp, dqp)	(XFS_QM_ISUDQ(dqp) ? \
+					 (tp)->t_dqinfo->dqa_usrdquots : \
+					 (tp)->t_dqinfo->dqa_grpdquots)
+#define XFS_IS_SUSER_DQUOT(dqp)		\
+	(INT_ISZERO((dqp)->q_core.d_id, ARCH_CONVERT))
+
+#define XFS_PURGE_INODE(ip)		\
+	{				\
+	  vmap_t dqvmap;		\
+	  vnode_t *dqvp;		\
+	  dqvp = XFS_ITOV(ip);		\
+	  VMAP(dqvp, ip, dqvmap);	\
+	  VN_RELE(dqvp);		\
+	}
+
+#define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
+				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : "???"))
+#define DQFLAGTO_DIRTYSTR(d)	(XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY")
+
+#endif	/* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
new file mode 100644
index 000000000000..3da2680d20c5
--- /dev/null
+++ b/fs/xfs/xfs_rename.c
@@ -0,0 +1,679 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Given an array of up to 4 inode pointers, unlock the pointed to inodes.
+ * If there are fewer than 4 entries in the array, the empty entries will
+ * be at the end and will have NULL pointers in them.
+ */
+STATIC void
+xfs_rename_unlock4(
+	xfs_inode_t	**i_tab,
+	uint		lock_mode)
+{
+	int	i;
+
+	xfs_iunlock(i_tab[0], lock_mode);
+	for (i = 1; i < 4; i++) {
+		if (i_tab[i] == NULL) {
+			break;
+		}
+		/*
+		 * Watch out for duplicate entries in the table.
+		 */
+		if (i_tab[i] != i_tab[i-1]) {
+			xfs_iunlock(i_tab[i], lock_mode);
+		}
+	}
+}
+
+#ifdef DEBUG
+int xfs_rename_skip, xfs_rename_nskip;
+#endif
+
+/*
+ * The following routine will acquire the locks required for a rename
+ * operation. The code understands the semantics of renames and will
+ * validate that name1 exists under dp1 & that name2 may or may not
+ * exist under dp2.
+ *
+ * We are renaming dp1/name1 to dp2/name2.
+ *
+ * Return ENOENT if dp1 does not exist, other lookup errors, or 0 for success.
+ * Return EAGAIN if the caller needs to try again.
+ */
+STATIC int
+xfs_lock_for_rename(
+	xfs_inode_t	*dp1,	/* old (source) directory inode */
+	xfs_inode_t	*dp2,	/* new (target) directory inode */
+	struct dentry	*dentry1, /* old entry name */
+	struct dentry	*dentry2, /* new entry name */
+	xfs_inode_t	**ipp1, /* inode of old entry */
+	xfs_inode_t	**ipp2, /* inode of new entry, if it
+				   already exists, NULL otherwise. */
+	xfs_inode_t	**i_tab,/* array of inode returned, sorted */
+	int		*num_inodes)  /* number of inodes in array */
+{
+	xfs_inode_t		*ip1, *ip2, *temp;
+	xfs_ino_t		inum1, inum2;
+	int			error;
+	int			i, j;
+	uint			lock_mode;
+	uint			lookup_flags;
+	int			diff_dirs = (dp1 != dp2);
+
+	ip2 = NULL;
+
+	/*
+	 * First, find out the current inums of the entries so that we
+	 * can determine the initial locking order.  We'll have to
+	 * sanity check stuff after all the locks have been acquired
+	 * to see if we still have the right inodes, directories, etc.
+	 */
+	lock_mode = xfs_ilock_map_shared(dp1);
+	error = xfs_get_dir_entry(dentry1, &ip1);
+	if (error) {
+		xfs_iunlock_map_shared(dp1, lock_mode);
+		return error;
+	}
+
+	inum1 = ip1->i_ino;
+
+	ASSERT(ip1);
+	ITRACE(ip1);
+
+	/*
+	 * Unlock dp1 and lock dp2 if they are different.
+	 */
+
+	if (diff_dirs) {
+		xfs_iunlock_map_shared(dp1, lock_mode);
+		lock_mode = xfs_ilock_map_shared(dp2);
+	}
+
+	lookup_flags = DLF_IGET;
+	if (lock_mode == XFS_ILOCK_SHARED) {
+		lookup_flags |= DLF_LOCK_SHARED;
+	}
+	error = xfs_dir_lookup_int(XFS_ITOBHV(dp2), lookup_flags,
+				   dentry2, &inum2, &ip2);
+	if (error == ENOENT) {		/* target does not need to exist. */
+		inum2 = 0;
+	} else if (error) {
+		/*
+		 * If dp2 and dp1 are the same, the next line unlocks dp1.
+		 * Got it?
+		 */
+		xfs_iunlock_map_shared(dp2, lock_mode);
+		IRELE (ip1);
+		return error;
+	} else {
+		ITRACE(ip2);
+	}
+
+	/*
+	 * i_tab contains a list of pointers to inodes.	 We initialize
+	 * the table here & we'll sort it.  We will then use it to
+	 * order the acquisition of the inode locks.
+	 *
+	 * Note that the table may contain duplicates.	e.g., dp1 == dp2.
+	 */
+	i_tab[0] = dp1;
+	i_tab[1] = dp2;
+	i_tab[2] = ip1;
+	if (inum2 == 0) {
+		*num_inodes = 3;
+		i_tab[3] = NULL;
+	} else {
+		*num_inodes = 4;
+		i_tab[3] = ip2;
+	}
+
+	/*
+	 * Sort the elements via bubble sort.  (Remember, there are at
+	 * most 4 elements to sort, so this is adequate.)
+	 */
+	for (i=0; i < *num_inodes; i++) {
+		for (j=1; j < *num_inodes; j++) {
+			if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
+				temp = i_tab[j];
+				i_tab[j] = i_tab[j-1];
+				i_tab[j-1] = temp;
+			}
+		}
+	}
+
+	/*
+	 * We have dp2 locked. If it isn't first, unlock it.
+	 * If it is first, tell xfs_lock_inodes so it can skip it
+	 * when locking. if dp1 == dp2, xfs_lock_inodes will skip both
+	 * since they are equal. xfs_lock_inodes needs all these inodes
+	 * so that it can unlock and retry if there might be a dead-lock
+	 * potential with the log.
+	 */
+
+	if (i_tab[0] == dp2 && lock_mode == XFS_ILOCK_SHARED) {
+#ifdef DEBUG
+		xfs_rename_skip++;
+#endif
+		xfs_lock_inodes(i_tab, *num_inodes, 1, XFS_ILOCK_SHARED);
+	} else {
+#ifdef DEBUG
+		xfs_rename_nskip++;
+#endif
+		xfs_iunlock_map_shared(dp2, lock_mode);
+		xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED);
+	}
+
+	/*
+	 * Set the return value. Null out any unused entries in i_tab.
+	 */
+	*ipp1 = *ipp2 = NULL;
+	for (i=0; i < *num_inodes; i++) {
+		if (i_tab[i]->i_ino == inum1) {
+			*ipp1 = i_tab[i];
+		}
+		if (i_tab[i]->i_ino == inum2) {
+			*ipp2 = i_tab[i];
+		}
+	}
+	for (;i < 4; i++) {
+		i_tab[i] = NULL;
+	}
+	return 0;
+}
+
+
+int rename_which_error_return = 0;
+
+#ifdef DEBUG
+int xfs_rename_agains;
+int xfs_renames;
+#endif
+
+/*
+ * xfs_rename
+ */
+int
+xfs_rename(
+	bhv_desc_t	*src_dir_bdp,
+	struct dentry	*src_dentry,
+	vnode_t		*target_dir_vp,
+	struct dentry	*target_dentry,
+	cred_t		*credp)
+{
+	xfs_trans_t	*tp;
+	xfs_inode_t	*src_dp, *target_dp, *src_ip, *target_ip;
+	xfs_mount_t	*mp;
+	int		new_parent;		/* moving to a new dir */
+	int		src_is_directory;	/* src_name is a directory */
+	int		error;
+	xfs_bmap_free_t free_list;
+	xfs_fsblock_t	first_block;
+	int		cancel_flags;
+	int		committed;
+	xfs_inode_t	*inodes[4];
+	int		target_ip_dropped = 0;	/* dropped target_ip link? */
+	vnode_t		*src_dir_vp;
+	bhv_desc_t	*target_dir_bdp;
+	int		spaceres;
+	int		target_link_zero = 0;
+	int		num_inodes;
+	char		*src_name = (char *)src_dentry->d_name.name;
+	char		*target_name = (char *)target_dentry->d_name.name;
+	int		src_namelen;
+	int		target_namelen;
+#ifdef DEBUG
+	int		retries;
+
+	xfs_renames++;
+#endif
+	src_dir_vp = BHV_TO_VNODE(src_dir_bdp);
+	vn_trace_entry(src_dir_vp, "xfs_rename", (inst_t *)__return_address);
+	vn_trace_entry(target_dir_vp, "xfs_rename", (inst_t *)__return_address);
+
+	/*
+	 * Find the XFS behavior descriptor for the target directory
+	 * vnode since it was not handed to us.
+	 */
+	target_dir_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(target_dir_vp),
+						&xfs_vnodeops);
+	if (target_dir_bdp == NULL) {
+		return XFS_ERROR(EXDEV);
+	}
+	src_namelen = src_dentry->d_name.len;
+	if (src_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+	target_namelen = target_dentry->d_name.len;
+	if (target_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+	src_dp = XFS_BHVTOI(src_dir_bdp);
+	target_dp = XFS_BHVTOI(target_dir_bdp);
+	if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) ||
+	    DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
+				target_dp, DM_EVENT_RENAME)) {
+		error = dm_send_namesp_event(DM_EVENT_RENAME,
+					src_dir_bdp, DM_RIGHT_NULL,
+					target_dir_bdp, DM_RIGHT_NULL,
+					src_name, target_name,
+					0, 0, 0);
+		if (error) {
+			return error;
+		}
+	}
+	/* Return through std_return after this point. */
+
+#ifdef DEBUG
+	retries = 0;
+#endif
+	/*
+	 * Lock all the participating inodes. Depending upon whether
+	 * the target_name exists in the target directory, and
+	 * whether the target directory is the same as the source
+	 * directory, we can lock from 2 to 4 inodes.
+	 * xfs_lock_for_rename() will return ENOENT if src_name
+	 * does not exist in the source directory.
+	 */
+	tp = NULL;
+	do {
+		error = xfs_lock_for_rename(src_dp, target_dp, src_dentry,
+				target_dentry, &src_ip, &target_ip, inodes,
+				&num_inodes);
+#ifdef DEBUG
+		if (error == EAGAIN)
+			xfs_rename_agains++;
+#endif
+	} while (error == EAGAIN);
+
+	if (error) {
+		rename_which_error_return = __LINE__;
+		/*
+		 * We have nothing locked, no inode references, and
+		 * no transaction, so just get out.
+		 */
+		goto std_return;
+	}
+
+	ASSERT(src_ip != NULL);
+
+	if ((src_ip->i_d.di_mode & IFMT) == IFDIR) {
+		/*
+		 * Check for link count overflow on target_dp
+		 */
+		if (target_ip == NULL && (src_dp != target_dp) &&
+		    target_dp->i_d.di_nlink >= XFS_MAXLINK) {
+			rename_which_error_return = __LINE__;
+			error = XFS_ERROR(EMLINK);
+			xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
+			goto rele_return;
+		}
+	}
+
+	new_parent = (src_dp != target_dp);
+	src_is_directory = ((src_ip->i_d.di_mode & IFMT) == IFDIR);
+
+	/*
+	 * Drop the locks on our inodes so that we can do the ancestor
+	 * check if necessary and start the transaction.
+	 */
+	xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED);
+
+	XFS_BMAP_INIT(&free_list, &first_block);
+	mp = src_dp->i_mount;
+	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen);
+	error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
+	if (error == ENOSPC) {
+		spaceres = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
+	}
+	if (error) {
+		rename_which_error_return = __LINE__;
+		xfs_trans_cancel(tp, 0);
+		goto rele_return;
+	}
+
+	/*
+	 * Attach the dquots to the inodes
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if ((error = xfs_qm_vop_rename_dqattach(inodes))) {
+			xfs_trans_cancel(tp, cancel_flags);
+			rename_which_error_return = __LINE__;
+			goto rele_return;
+		}
+	}
+
+	/*
+	 * Reacquire the inode locks we dropped above.
+	 */
+	xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL);
+
+	/*
+	 * Join all the inodes to the transaction. From this point on,
+	 * we can rely on either trans_commit or trans_cancel to unlock
+	 * them.  Note that we need to add a vnode reference to the
+	 * directories since trans_commit & trans_cancel will decrement
+	 * them when they unlock the inodes.  Also, we need to be careful
+	 * not to add an inode to the transaction more than once.
+	 */
+	VN_HOLD(src_dir_vp);
+	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+	if (new_parent) {
+		VN_HOLD(target_dir_vp);
+		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+	}
+	if ((src_ip != src_dp) && (src_ip != target_dp)) {
+		xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
+	}
+	if ((target_ip != NULL) &&
+	    (target_ip != src_ip) &&
+	    (target_ip != src_dp) &&
+	    (target_ip != target_dp)) {
+		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
+	}
+
+	/*
+	 * Set up the target.
+	 */
+	if (target_ip == NULL) {
+		/*
+		 * If there's no space reservation, check the entry will
+		 * fit before actually inserting it.
+		 */
+		if (spaceres == 0 &&
+		    (error = XFS_DIR_CANENTER(mp, tp, target_dp, target_name,
+				target_namelen))) {
+			rename_which_error_return = __LINE__;
+			goto error_return;
+		}
+		/*
+		 * If target does not exist and the rename crosses
+		 * directories, adjust the target directory link count
+		 * to account for the ".." reference from the new entry.
+		 */
+		error = XFS_DIR_CREATENAME(mp, tp, target_dp, target_name,
+					   target_namelen, src_ip->i_ino,
+					   &first_block, &free_list, spaceres);
+		if (error == ENOSPC) {
+			rename_which_error_return = __LINE__;
+			goto error_return;
+		}
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;
+		}
+		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		if (new_parent && src_is_directory) {
+			error = xfs_bumplink(tp, target_dp);
+			if (error) {
+				rename_which_error_return = __LINE__;
+				goto abort_return;
+			}
+		}
+	} else { /* target_ip != NULL */
+
+		/*
+		 * If target exists and it's a directory, check that both
+		 * target and source are directories and that target can be
+		 * destroyed, or that neither is a directory.
+		 */
+		if ((target_ip->i_d.di_mode & IFMT) == IFDIR) {
+			/*
+			 * Make sure target dir is empty.
+			 */
+			if (!(XFS_DIR_ISEMPTY(target_ip->i_mount, target_ip)) ||
+			    (target_ip->i_d.di_nlink > 2)) {
+				error = XFS_ERROR(EEXIST);
+				rename_which_error_return = __LINE__;
+				goto error_return;
+			}
+		}
+
+		/*
+		 * Link the source inode under the target name.
+		 * If the source inode is a directory and we are moving
+		 * it across directories, its ".." entry will be
+		 * inconsistent until we replace that down below.
+		 *
+		 * In case there is already an entry with the same
+		 * name at the destination directory, remove it first.
+		 */
+		error = XFS_DIR_REPLACE(mp, tp, target_dp, target_name,
+			target_namelen, src_ip->i_ino, &first_block,
+			&free_list, spaceres);
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;
+		}
+		xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+		/*
+		 * Decrement the link count on the target since the target
+		 * dir no longer points to it.
+		 */
+		error = xfs_droplink(tp, target_ip);
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;;
+		}
+		target_ip_dropped = 1;
+
+		/* Do this test while we still hold the locks */
+		target_link_zero = (target_ip)->i_d.di_nlink==0;
+
+		if (src_is_directory) {
+			/*
+			 * Drop the link from the old "." entry.
+			 */
+			error = xfs_droplink(tp, target_ip);
+			if (error) {
+				rename_which_error_return = __LINE__;
+				goto abort_return;
+			}
+		}
+
+	} /* target_ip != NULL */
+
+	/*
+	 * Remove the source.
+	 */
+	if (new_parent && src_is_directory) {
+
+		/*
+		 * Rewrite the ".." entry to point to the new
+		 * directory.
+		 */
+		error = XFS_DIR_REPLACE(mp, tp, src_ip, "..", 2,
+					target_dp->i_ino, &first_block,
+					&free_list, spaceres);
+		ASSERT(error != EEXIST);
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;
+		}
+		xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	} else {
+		/*
+		 * We always want to hit the ctime on the source inode.
+		 * We do it in the if clause above for the 'new_parent &&
+		 * src_is_directory' case, and here we get all the other
+		 * cases.  This isn't strictly required by the standards
+		 * since the source inode isn't really being changed,
+		 * but old unix file systems did it and some incremental
+		 * backup programs won't work without it.
+		 */
+		xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
+	}
+
+	/*
+	 * Adjust the link count on src_dp.  This is necessary when
+	 * renaming a directory, either within one parent when
+	 * the target existed, or across two parent directories.
+	 */
+	if (src_is_directory && (new_parent || target_ip != NULL)) {
+
+		/*
+		 * Decrement link count on src_directory since the
+		 * entry that's moved no longer points to it.
+		 */
+		error = xfs_droplink(tp, src_dp);
+		if (error) {
+			rename_which_error_return = __LINE__;
+			goto abort_return;
+		}
+	}
+
+	error = XFS_DIR_REMOVENAME(mp, tp, src_dp, src_name, src_namelen,
+			src_ip->i_ino, &first_block, &free_list, spaceres);
+	if (error) {
+		rename_which_error_return = __LINE__;
+		goto abort_return;
+	}
+	xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/*
+	 * Update the generation counts on all the directory inodes
+	 * that we're modifying.
+	 */
+	src_dp->i_gen++;
+	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
+
+	if (new_parent) {
+		target_dp->i_gen++;
+		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
+	}
+
+	/*
+	 * If there was a target inode, take an extra reference on
+	 * it here so that it doesn't go to xfs_inactive() from
+	 * within the commit.
+	 */
+	if (target_ip != NULL) {
+		IHOLD(target_ip);
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * rename transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	/*
+	 * Take refs. for vop_link_removed calls below.	 No need to worry
+	 * about directory refs. because the caller holds them.
+	 *
+	 * Do holds before the xfs_bmap_finish since it might rele them down
+	 * to zero.
+	 */
+
+	if (target_ip_dropped)
+		IHOLD(target_ip);
+	IHOLD(src_ip);
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		xfs_bmap_cancel(&free_list);
+		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
+				 XFS_TRANS_ABORT));
+		if (target_ip != NULL) {
+			IRELE(target_ip);
+		}
+		if (target_ip_dropped) {
+			IRELE(target_ip);
+		}
+		IRELE(src_ip);
+		goto std_return;
+	}
+
+	/*
+	 * trans_commit will unlock src_ip, target_ip & decrement
+	 * the vnode references.
+	 */
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (target_ip != NULL) {
+		xfs_refcache_purge_ip(target_ip);
+		IRELE(target_ip);
+	}
+	/*
+	 * Let interposed file systems know about removed links.
+	 */
+	if (target_ip_dropped) {
+		VOP_LINK_REMOVED(XFS_ITOV(target_ip), target_dir_vp,
+					target_link_zero);
+		IRELE(target_ip);
+	}
+
+	FSC_NOTIFY_NAME_CHANGED(XFS_ITOV(src_ip));
+
+	IRELE(src_ip);
+
+	/* Fall through to std_return with error = 0 or errno from
+	 * xfs_trans_commit	 */
+std_return:
+	if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_POSTRENAME) ||
+	    DM_EVENT_ENABLED(target_dir_vp->v_vfsp,
+				target_dp, DM_EVENT_POSTRENAME)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTRENAME,
+					src_dir_bdp, DM_RIGHT_NULL,
+					target_dir_bdp, DM_RIGHT_NULL,
+					src_name, target_name,
+					0, error, 0);
+	}
+	return error;
+
+ abort_return:
+	cancel_flags |= XFS_TRANS_ABORT;
+	/* FALLTHROUGH */
+ error_return:
+	xfs_bmap_cancel(&free_list);
+	xfs_trans_cancel(tp, cancel_flags);
+	goto std_return;
+
+ rele_return:
+	IRELE(src_ip);
+	if (target_ip != NULL) {
+		IRELE(target_ip);
+	}
+	goto std_return;
+}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
new file mode 100644
index 000000000000..82684350d6eb
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.c
@@ -0,0 +1,2438 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+/*
+ * Free realtime space allocation for XFS.
+ */
+
+#include <xfs.h>
+
+
+/*
+ * Prototypes for internal functions.
+ */
+
+
+STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *);
+STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int,
+		xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *);
+STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_extlen_t, int, xfs_rtblock_t *, int *);
+STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_rtblock_t, xfs_rtblock_t *);
+STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_rtblock_t, xfs_rtblock_t *);
+STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int,
+		xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *);
+STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
+		xfs_extlen_t, int);
+STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
+		xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *);
+
+/*
+ * Internal functions.
+ */
+
+/*
+ * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
+ */
+STATIC int
+xfs_lowbit32(
+	__uint32_t	v)
+{
+	return ffs(v)-1;
+}
+
+/*
+ * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ */
+STATIC int				/* error */
+xfs_growfs_rt_alloc(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_extlen_t	oblocks,	/* old count of blocks */
+	xfs_extlen_t	nblocks,	/* new count of blocks */
+	xfs_ino_t	ino)		/* inode number (bitmap/summary) */
+{
+	xfs_fileoff_t	bno;		/* block number in file */
+	xfs_buf_t	*bp;		/* temporary buffer for zeroing */
+	int		cancelflags;	/* flags for xfs_trans_cancel */
+	int		committed;	/* transaction committed flag */
+	xfs_daddr_t	d;		/* disk block address */
+	int		error;		/* error return value */
+	xfs_fsblock_t	firstblock;	/* first block allocated in xaction */
+	xfs_bmap_free_t flist;		/* list of freed blocks */
+	xfs_fsblock_t	fsbno;		/* filesystem block for bno */
+	xfs_inode_t	*ip;		/* pointer to incore inode */
+	xfs_bmbt_irec_t map;		/* block map output */
+	int		nmap;		/* number of block maps */
+	int		resblks;	/* space reservation */
+	xfs_trans_t	*tp;		/* transaction pointer */
+
+	/*
+	 * Allocate space to the file, as necessary.
+	 */
+	while (oblocks < nblocks) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
+		resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
+		cancelflags = 0;
+		/*
+		 * Reserve space & log for one extent added to the file.
+		 */
+		if ((error = xfs_trans_reserve(tp, resblks,
+				XFS_GROWRTALLOC_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES,
+				XFS_DEFAULT_PERM_LOG_COUNT)))
+			goto error_exit;
+		cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+		/*
+		 * Lock the inode.
+		 */
+		if ((error = xfs_trans_iget(mp, tp, ino, XFS_ILOCK_EXCL, &ip)))
+			goto error_exit;
+		XFS_BMAP_INIT(&flist, &firstblock);
+		/*
+		 * Allocate blocks to the bitmap file.
+		 */
+		nmap = 1;
+		cancelflags |= XFS_TRANS_ABORT;
+		error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks,
+			XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock,
+			resblks, &map, &nmap, &flist);
+		if (!error && nmap < 1)
+			error = XFS_ERROR(ENOSPC);
+		if (error)
+			goto error_exit;
+		/*
+		 * Free any blocks freed up in the transaction, then commit.
+		 */
+		error = xfs_bmap_finish(&tp, &flist, firstblock, &committed);
+		if (error)
+			goto error_exit;
+		xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		/*
+		 * Now we need to clear the allocated blocks.
+		 * Do this one block per transaction, to keep it simple.
+		 */
+		cancelflags = 0;
+		for (bno = map.br_startoff, fsbno = map.br_startblock;
+		     bno < map.br_startoff + map.br_blockcount;
+		     bno++, fsbno++) {
+			tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
+			/*
+			 * Reserve log for one block zeroing.
+			 */
+			if ((error = xfs_trans_reserve(tp, 0,
+					XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
+				goto error_exit;
+			/*
+			 * Lock the bitmap inode.
+			 */
+			if ((error = xfs_trans_iget(mp, tp, ino, XFS_ILOCK_EXCL,
+					&ip)))
+				goto error_exit;
+			/*
+			 * Get a buffer for the block.
+			 */
+			d = XFS_FSB_TO_DADDR(mp, fsbno);
+			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+				mp->m_bsize, 0);
+			if (bp == NULL) {
+				error = XFS_ERROR(EIO);
+				goto error_exit;
+			}
+			bzero(XFS_BUF_PTR(bp), mp->m_sb.sb_blocksize);
+			xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
+			/*
+			 * Commit the transaction.
+			 */
+			xfs_trans_commit(tp, 0, NULL);
+		}
+		/*
+		 * Go on to the next extent, if any.
+		 */
+		oblocks = map.br_startoff + map.br_blockcount;
+	}
+	return 0;
+error_exit:
+	xfs_trans_cancel(tp, cancelflags);
+	return error;
+}
+
+/*
+ * Attempt to allocate an extent minlen<=len<=maxlen starting from
+ * bitmap block bbno.  If we don't get maxlen then use prod to trim
+ * the length, if given.  Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_block(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_rtblock_t	*nextp,		/* out: next block to try */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	xfs_rtblock_t	besti;		/* best rtblock found so far */
+	xfs_rtblock_t	bestlen;	/* best length found so far */
+	xfs_rtblock_t	end;		/* last rtblock in chunk */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current rtblock trying */
+	xfs_rtblock_t	next;		/* next rtblock to try */
+	int		stat;		/* status from internal calls */
+
+	/*
+	 * Loop over all the extents starting in this bitmap block,
+	 * looking for one that's long enough.
+	 */
+	for (i = XFS_BLOCKTOBIT(mp, bbno), besti = -1, bestlen = 0,
+		end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
+	     i <= end;
+	     i++) {
+		/*
+		 * See if there's a free extent of maxlen starting at i.
+		 * If it's not so then next will contain the first non-free.
+		 */
+		error = xfs_rtcheck_range(mp, tp, i, maxlen, 1, &next, &stat);
+		if (error) {
+			return error;
+		}
+		if (stat) {
+			/*
+			 * i for maxlen is all free, allocate and return that.
+			 */
+			error = xfs_rtallocate_range(mp, tp, i, maxlen, rbpp,
+				rsb);
+			if (error) {
+				return error;
+			}
+			*len = maxlen;
+			*rtblock = i;
+			return 0;
+		}
+		/*
+		 * In the case where we have a variable-sized allocation
+		 * request, figure out how big this free piece is,
+		 * and if it's big enough for the minimum, and the best
+		 * so far, remember it.
+		 */
+		if (minlen < maxlen) {
+			xfs_rtblock_t	thislen;	/* this extent size */
+
+			thislen = next - i;
+			if (thislen >= minlen && thislen > bestlen) {
+				besti = i;
+				bestlen = thislen;
+			}
+		}
+		/*
+		 * If not done yet, find the start of the next free space.
+		 */
+		if (next < end) {
+			error = xfs_rtfind_forw(mp, tp, next, end, &i);
+			if (error) {
+				return error;
+			}
+		} else
+			break;
+	}
+	/*
+	 * Searched the whole thing & didn't find a maxlen free extent.
+	 */
+	if (minlen < maxlen && besti != -1) {
+		xfs_extlen_t	p;	/* amount to trim length by */
+
+		/*
+		 * If size should be a multiple of prod, make that so.
+		 */
+		if (prod > 1 && (p = do_mod(bestlen, prod)))
+			bestlen -= p;
+		/*
+		 * Allocate besti for bestlen & return that.
+		 */
+		error = xfs_rtallocate_range(mp, tp, besti, bestlen, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+		*len = bestlen;
+		*rtblock = besti;
+		return 0;
+	}
+	/*
+	 * Allocation failed.  Set *nextp to the next block to try.
+	 */
+	*nextp = next;
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting at block
+ * bno.	 If we don't get maxlen then use prod to trim the length, if given.
+ * Returns error; returns starting block in *rtblock.
+ * The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_exact(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		error;		/* error value */
+	xfs_extlen_t	i;		/* extent length trimmed due to prod */
+	int		isfree;		/* extent is free */
+	xfs_rtblock_t	next;		/* next block to try (dummy) */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	/*
+	 * Check if the range in question (for maxlen) is free.
+	 */
+	error = xfs_rtcheck_range(mp, tp, bno, maxlen, 1, &next, &isfree);
+	if (error) {
+		return error;
+	}
+	if (isfree) {
+		/*
+		 * If it is, allocate it and return success.
+		 */
+		error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+		*len = maxlen;
+		*rtblock = bno;
+		return 0;
+	}
+	/*
+	 * If not, allocate what there is, if it's at least minlen.
+	 */
+	maxlen = next - bno;
+	if (maxlen < minlen) {
+		/*
+		 * Failed, return failure status.
+		 */
+		*rtblock = NULLRTBLOCK;
+		return 0;
+	}
+	/*
+	 * Trim off tail of extent, if prod is specified.
+	 */
+	if (prod > 1 && (i = maxlen % prod)) {
+		maxlen -= i;
+		if (maxlen < minlen) {
+			/*
+			 * Now we can't do it, return failure status.
+			 */
+			*rtblock = NULLRTBLOCK;
+			return 0;
+		}
+	}
+	/*
+	 * Allocate what we can and return it.
+	 */
+	error = xfs_rtallocate_range(mp, tp, bno, maxlen, rbpp, rsb);
+	if (error) {
+		return error;
+	}
+	*len = maxlen;
+	*rtblock = bno;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, starting as near
+ * to bno as possible.	If we don't get maxlen then use prod to trim
+ * the length, if given.  The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_near(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		any;		/* any useful extents from summary */
+	xfs_rtblock_t	bbno;		/* bitmap block number */
+	int		error;		/* error value */
+	int		i;		/* bitmap block offset (loop control) */
+	int		j;		/* secondary loop control */
+	int		log2len;	/* log2 of minlen */
+	xfs_rtblock_t	n;		/* next block to try */
+	xfs_rtblock_t	r;		/* result block */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	/*
+	 * If the block number given is off the end, silently set it to
+	 * the last block.
+	 */
+	if (bno >= mp->m_sb.sb_rextents)
+		bno = mp->m_sb.sb_rextents - 1;
+	/*
+	 * Try the exact allocation first.
+	 */
+	error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen, len,
+		rbpp, rsb, prod, &r);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If the exact allocation worked, return that.
+	 */
+	if (r != NULLRTBLOCK) {
+		*rtblock = r;
+		return 0;
+	}
+	bbno = XFS_BITTOBLOCK(mp, bno);
+	i = 0;
+	log2len = xfs_highbit32(minlen);
+	/*
+	 * Loop over all bitmap blocks (bbno + i is current block).
+	 */
+	for (;;) {
+		/*
+		 * Get summary information of extents of all useful levels
+		 * starting in this bitmap block.
+		 */
+		error = xfs_rtany_summary(mp, tp, log2len, mp->m_rsumlevels - 1,
+			bbno + i, rbpp, rsb, &any);
+		if (error) {
+			return error;
+		}
+		/*
+		 * If there are any useful extents starting here, try
+		 * allocating one.
+		 */
+		if (any) {
+			/*
+			 * On the positive side of the starting location.
+			 */
+			if (i >= 0) {
+				/*
+				 * Try to allocate an extent starting in
+				 * this block.
+				 */
+				error = xfs_rtallocate_extent_block(mp, tp,
+					bbno + i, minlen, maxlen, len, &n, rbpp,
+					rsb, prod, &r);
+				if (error) {
+					return error;
+				}
+				/*
+				 * If it worked, return it.
+				 */
+				if (r != NULLRTBLOCK) {
+					*rtblock = r;
+					return 0;
+				}
+			}
+			/*
+			 * On the negative side of the starting location.
+			 */
+			else {		/* i < 0 */
+				/*
+				 * Loop backwards through the bitmap blocks from
+				 * the starting point-1 up to where we are now.
+				 * There should be an extent which ends in this
+				 * bitmap block and is long enough.
+				 */
+				for (j = -1; j > i; j--) {
+					/*
+					 * Grab the summary information for
+					 * this bitmap block.
+					 */
+					error = xfs_rtany_summary(mp, tp,
+						log2len, mp->m_rsumlevels - 1,
+						bbno + j, rbpp, rsb, &any);
+					if (error) {
+						return error;
+					}
+					/*
+					 * If there's no extent given in the
+					 * summary that means the extent we
+					 * found must carry over from an
+					 * earlier block.  If there is an
+					 * extent given, we've already tried
+					 * that allocation, don't do it again.
+					 */
+					if (any)
+						continue;
+					error = xfs_rtallocate_extent_block(mp,
+						tp, bbno + j, minlen, maxlen,
+						len, &n, rbpp, rsb, prod, &r);
+					if (error) {
+						return error;
+					}
+					/*
+					 * If it works, return the extent.
+					 */
+					if (r != NULLRTBLOCK) {
+						*rtblock = r;
+						return 0;
+					}
+				}
+				/*
+				 * There weren't intervening bitmap blocks
+				 * with a long enough extent, or the
+				 * allocation didn't work for some reason
+				 * (i.e. it's a little * too short).
+				 * Try to allocate from the summary block
+				 * that we found.
+				 */
+				error = xfs_rtallocate_extent_block(mp, tp,
+					bbno + i, minlen, maxlen, len, &n, rbpp,
+					rsb, prod, &r);
+				if (error) {
+					return error;
+				}
+				/*
+				 * If it works, return the extent.
+				 */
+				if (r != NULLRTBLOCK) {
+					*rtblock = r;
+					return 0;
+				}
+			}
+		}
+		/*
+		 * Loop control.  If we were on the positive side, and there's
+		 * still more blocks on the negative side, go there.
+		 */
+		if (i > 0 && (int)bbno - i >= 0)
+			i = -i;
+		/*
+		 * If positive, and no more negative, but there are more
+		 * positive, go there.
+		 */
+		else if (i > 0 && (int)bbno + i < mp->m_sb.sb_rbmblocks - 1)
+			i++;
+		/*
+		 * If negative or 0 (just started), and there are positive
+		 * blocks to go, go there.  The 0 case moves to block 1.
+		 */
+		else if (i <= 0 && (int)bbno - i < mp->m_sb.sb_rbmblocks - 1)
+			i = 1 - i;
+		/*
+		 * If negative or 0 and there are more negative blocks,
+		 * go there.
+		 */
+		else if (i <= 0 && (int)bbno + i > 0)
+			i--;
+		/*
+		 * Must be done.  Return failure.
+		 */
+		else
+			break;
+	}
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
+
+/*
+ * Allocate an extent of length minlen<=len<=maxlen, with no position
+ * specified.  If we don't get maxlen then use prod to trim
+ * the length, if given.  The lengths are all in rtextents.
+ */
+STATIC int				/* error */
+xfs_rtallocate_extent_size(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		error;		/* error value */
+	int		i;		/* bitmap block number */
+	int		l;		/* level number (loop control) */
+	xfs_rtblock_t	n;		/* next block to be tried */
+	xfs_rtblock_t	r;		/* result block number */
+	xfs_suminfo_t	sum;		/* summary information for extents */
+
+	ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+	/*
+	 * Loop over all the levels starting with maxlen.
+	 * At each level, look at all the bitmap blocks, to see if there
+	 * are extents starting there that are long enough (>= maxlen).
+	 * Note, only on the initial level can the allocation fail if
+	 * the summary says there's an extent.
+	 */
+	for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
+		/*
+		 * Loop over all the bitmap blocks.
+		 */
+		for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+			/*
+			 * Get the summary for this level/block.
+			 */
+			error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+				&sum);
+			if (error) {
+				return error;
+			}
+			/*
+			 * Nothing there, on to the next block.
+			 */
+			if (!sum)
+				continue;
+			/*
+			 * Try allocating the extent.
+			 */
+			error = xfs_rtallocate_extent_block(mp, tp, i, maxlen,
+				maxlen, len, &n, rbpp, rsb, prod, &r);
+			if (error) {
+				return error;
+			}
+			/*
+			 * If it worked, return that.
+			 */
+			if (r != NULLRTBLOCK) {
+				*rtblock = r;
+				return 0;
+			}
+			/*
+			 * If the "next block to try" returned from the
+			 * allocator is beyond the next bitmap block,
+			 * skip to that bitmap block.
+			 */
+			if (XFS_BITTOBLOCK(mp, n) > i + 1)
+				i = XFS_BITTOBLOCK(mp, n) - 1;
+		}
+	}
+	/*
+	 * Didn't find any maxlen blocks.  Try smaller ones, unless
+	 * we're asking for a fixed size extent.
+	 */
+	if (minlen > --maxlen) {
+		*rtblock = NULLRTBLOCK;
+		return 0;
+	}
+	/*
+	 * Loop over sizes, from maxlen down to minlen.
+	 * This time, when we do the allocations, allow smaller ones
+	 * to succeed.
+	 */
+	for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
+		/*
+		 * Loop over all the bitmap blocks, try an allocation
+		 * starting in that block.
+		 */
+		for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+			/*
+			 * Get the summary information for this level/block.
+			 */
+			error = xfs_rtget_summary(mp, tp, l, i, rbpp, rsb,
+						  &sum);
+			if (error) {
+				return error;
+			}
+			/*
+			 * If nothing there, go on to next.
+			 */
+			if (!sum)
+				continue;
+			/*
+			 * Try the allocation.	Make sure the specified
+			 * minlen/maxlen are in the possible range for
+			 * this summary level.
+			 */
+			error = xfs_rtallocate_extent_block(mp, tp, i,
+					XFS_RTMAX(minlen, 1 << l),
+					XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
+					len, &n, rbpp, rsb, prod, &r);
+			if (error) {
+				return error;
+			}
+			/*
+			 * If it worked, return that extent.
+			 */
+			if (r != NULLRTBLOCK) {
+				*rtblock = r;
+				return 0;
+			}
+			/*
+			 * If the "next block to try" returned from the
+			 * allocator is beyond the next bitmap block,
+			 * skip to that bitmap block.
+			 */
+			if (XFS_BITTOBLOCK(mp, n) > i + 1)
+				i = XFS_BITTOBLOCK(mp, n) - 1;
+		}
+	}
+	/*
+	 * Got nothing, return failure.
+	 */
+	*rtblock = NULLRTBLOCK;
+	return 0;
+}
+
+/*
+ * Mark an extent specified by start and len allocated.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int				/* error */
+xfs_rtallocate_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* start block to allocate */
+	xfs_extlen_t	len,		/* length to allocate */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the allocated extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock;	/* first block allocated > end */
+	xfs_rtblock_t	preblock;	/* first block allocated < start */
+
+	end = start + len - 1;
+	/*
+	 * Assume we're allocating out of the middle of a free extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of free extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Decrement the summary information corresponding to the entire
+	 * (old) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * If there are blocks not being allocated at the front of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being allocated at the end of the
+	 * old extent, add summary data for them to be free.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Modify the bitmap to mark this extent allocated.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 0);
+	return error;
+}
+
+/*
+ * Return whether there are any free extents in the size range given
+ * by low and high, for the bitmap block bbno.
+ */
+STATIC int				/* error */
+xfs_rtany_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		low,		/* low log2 extent size */
+	int		high,		/* high log2 extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	int		*stat)		/* out: any good extents here? */
+{
+	int		error;		/* error value */
+	int		log;		/* loop counter, log2 of ext. size */
+	xfs_suminfo_t	sum;		/* summary data */
+
+	/*
+	 * Loop over logs of extent sizes.  Order is irrelevant.
+	 */
+	for (log = low; log <= high; log++) {
+		/*
+		 * Get one summary datum.
+		 */
+		error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+		if (error) {
+			return error;
+		}
+		/*
+		 * If there are any, return success.
+		 */
+		if (sum) {
+			*stat = 1;
+			return 0;
+		}
+	}
+	/*
+	 * Found nothing, return failure.
+	 */
+	*stat = 0;
+	return 0;
+}
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+STATIC int				/* error */
+xfs_rtbuf_get(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	block,		/* block number in bitmap or summary */
+	int		issum,		/* is summary not bitmap */
+	xfs_buf_t	**bpp)		/* output: buffer for the block */
+{
+	xfs_buf_t	*bp;		/* block buffer, result */
+	xfs_daddr_t	d;		/* disk addr of block */
+	int		error;		/* error value */
+	xfs_fsblock_t	fsb;		/* fs block number for block */
+	xfs_inode_t	*ip;		/* bitmap or summary inode */
+
+	ip = issum ? mp->m_rsumip : mp->m_rbmip;
+	/*
+	 * Map from the file offset (block) and inode number to the
+	 * file system block.
+	 */
+	error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block);
+	if (error) {
+		return error;
+	}
+	ASSERT(fsb != NULLFSBLOCK);
+	/*
+	 * Convert to disk address for buffer cache.
+	 */
+	d = XFS_FSB_TO_DADDR(mp, fsb);
+	/*
+	 * Read the buffer.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+				   mp->m_bsize, 0, &bp);
+	if (error) {
+		return error;
+	}
+	ASSERT(bp && !XFS_BUF_GETERROR(bp));
+	*bpp = bp;
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int				/* error */
+xfs_rtcheck_alloc_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* out: 1 for allocated, 0 for not */
+{
+	xfs_rtblock_t	new;		/* dummy for xfs_rtcheck_range */
+
+	return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat);
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * Check whether the given block in the bitmap has the given value.
+ */
+STATIC int				/* 1 for matches, 0 for not */
+xfs_rtcheck_bit(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* bit (block) to check */
+	int		val)		/* 1 for free, 0 for allocated */
+{
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* pointer into the buffer */
+	/* REFERENCED */
+	int		error;		/* error value */
+	xfs_rtword_t	wdiff;		/* difference between bit & expected */
+	int		word;		/* word number in the buffer */
+	xfs_rtword_t	wval;		/* word value from buffer */
+
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	word = XFS_BITTOWORD(mp, start);
+	bit = (int)(start & (XFS_NBWORD - 1));
+	wval = bufp[word];
+	xfs_trans_brelse(tp, bp);
+	wdiff = (wval ^ -val) & ((xfs_rtword_t)1 << bit);
+	return !wdiff;
+}
+#endif	/* DEBUG */
+
+#if 0
+/*
+ * Check that the given extent (block range) is free already.
+ */
+STATIC int				/* error */
+xfs_rtcheck_free_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		*stat)		/* out: 1 for free, 0 for not */
+{
+	xfs_rtblock_t	new;		/* dummy for xfs_rtcheck_range */
+
+	return xfs_rtcheck_range(mp, tp, bno, len, 1, &new, stat);
+}
+#endif
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+STATIC int				/* error */
+xfs_rtcheck_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block number of extent */
+	xfs_extlen_t	len,		/* length of extent */
+	int		val,		/* 1 for free, 0 for allocated */
+	xfs_rtblock_t	*new,		/* out: first block not matching */
+	int		*stat)		/* out: 1 for matches, 0 for not */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zero's; 1 (free) => all one's.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not examined.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ val)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Mask of relevant bits.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ val) & mask)) {
+			/*
+			 * Different, compute first wrong bit and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*new = start + i;
+			*stat = 0;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * Successful, return.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*new = start + i;
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Copy and transform the summary file, given the old and new
+ * parameters in the mount structures.
+ */
+STATIC int				/* error */
+xfs_rtcopy_summary(
+	xfs_mount_t	*omp,		/* old file system mount point */
+	xfs_mount_t	*nmp,		/* new file system mount point */
+	xfs_trans_t	*tp)		/* transaction pointer */
+{
+	xfs_rtblock_t	bbno;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* summary buffer */
+	int		error;		/* error return value */
+	int		log;		/* summary level number (log length) */
+	xfs_suminfo_t	sum;		/* summary data */
+	xfs_fsblock_t	sumbno;		/* summary block number */
+
+	bp = NULL;
+	for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
+		for (bbno = omp->m_sb.sb_rbmblocks - 1;
+		     (xfs_srtblock_t)bbno >= 0;
+		     bbno--) {
+			error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
+				&sumbno, &sum);
+			if (error)
+				return error;
+			if (sum == 0)
+				continue;
+			error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
+				&bp, &sumbno);
+			if (error)
+				return error;
+			ASSERT(sum > 0);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+STATIC int				/* error */
+xfs_rtfind_back(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	firstbit;	/* first useful bit in the word */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = start - limit + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit < XFS_NBWORD - 1) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+		mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+			firstbit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = bit - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i = bit - firstbit + 1;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the previous one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to previous block if that's where the previous word is
+		 * and we need the previous word.
+		 */
+		if (--word == -1 && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = XFS_BLOCKWMASK(mp);
+			b = &bufp[word];
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b--;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if (len - i) {
+		/*
+		 * Calculate first (leftmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		firstbit = XFS_NBWORD - (len - i);
+		mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+			*rtblock = start - i + 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start - i + 1;
+	return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+STATIC int				/* error */
+xfs_rtfind_forw(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to look at */
+	xfs_rtblock_t	limit,		/* last block to look at */
+	xfs_rtblock_t	*rtblock)	/* out: start block found */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtblock_t	i;		/* current bit number rel. to start */
+	xfs_rtblock_t	lastbit;	/* last useful bit in the word */
+	xfs_rtblock_t	len;		/* length of inspected area */
+	xfs_rtword_t	mask;		/* mask of relevant bits for value */
+	xfs_rtword_t	want;		/* mask for "good" values */
+	xfs_rtword_t	wdiff;		/* difference from wanted value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute and read in starting bitmap block for starting block.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	/*
+	 * Get the first word's index & point to it.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	len = limit - start + 1;
+	/*
+	 * Compute match value, based on the bit at start: if 1 (free)
+	 * then all-ones, else all-zeroes.
+	 */
+	want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+	/*
+	 * If the starting position is not word-aligned, deal with the
+	 * partial word.
+	 */
+	if (bit) {
+		/*
+		 * Calculate last (rightmost) bit number to look at,
+		 * and mask for all the relevant bits in this word.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Calculate the difference between the value there
+		 * and what we're looking for.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different.  Mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i = XFS_RTLOBIT(wdiff) - bit;
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i = lastbit - bit;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the previous one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the previous word in the buffer.
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = *b ^ want)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		}
+		i += XFS_NBWORD;
+		/*
+		 * Go on to next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * If done with this block, get the next one.
+			 */
+			xfs_trans_brelse(tp, bp);
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer.
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Calculate mask for all the relevant bits in this word.
+		 */
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Compute difference between actual and desired value.
+		 */
+		if ((wdiff = (*b ^ want) & mask)) {
+			/*
+			 * Different, mark where we are and return.
+			 */
+			xfs_trans_brelse(tp, bp);
+			i += XFS_RTLOBIT(wdiff);
+			*rtblock = start + i - 1;
+			return 0;
+		} else
+			i = len;
+	}
+	/*
+	 * No match, return that we scanned the whole area.
+	 */
+	xfs_trans_brelse(tp, bp);
+	*rtblock = start + i - 1;
+	return 0;
+}
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int				/* error */
+xfs_rtfree_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to free */
+	xfs_extlen_t	len,		/* length to free */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_rtblock_t	end;		/* end of the freed extent */
+	int		error;		/* error value */
+	xfs_rtblock_t	postblock;	/* first block freed > end */
+	xfs_rtblock_t	preblock;	/* first block freed < start */
+
+	end = start + len - 1;
+	/*
+	 * Modify the bitmap to mark this extent freed.
+	 */
+	error = xfs_rtmodify_range(mp, tp, start, len, 1);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Assume we're freeing out of the middle of an allocated extent.
+	 * We need to find the beginning and end of the extent so we can
+	 * properly update the summary.
+	 */
+	error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Find the next allocated block (end of allocated extent).
+	 */
+	error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+		&postblock);
+	/*
+	 * If there are blocks not being freed at the front of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (preblock < start) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(start - preblock),
+			XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * If there are blocks not being freed at the end of the
+	 * old extent, add summary data for them to be allocated.
+	 */
+	if (postblock > end) {
+		error = xfs_rtmodify_summary(mp, tp,
+			XFS_RTBLOCKLOG(postblock - end),
+			XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+		if (error) {
+			return error;
+		}
+	}
+	/*
+	 * Increment the summary information corresponding to the entire
+	 * (new) free extent.
+	 */
+	error = xfs_rtmodify_summary(mp, tp,
+		XFS_RTBLOCKLOG(postblock + 1 - preblock),
+		XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+	return error;
+}
+
+/*
+ * Read and return the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+STATIC int				/* error */
+xfs_rtget_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_suminfo_t	*sum)		/* out: summary info for this block */
+{
+	xfs_buf_t	*bp;		/* buffer for summary block */
+	int		error;		/* error value */
+	xfs_fsblock_t	sb;		/* summary fsblock */
+	int		so;		/* index into the summary file */
+	xfs_suminfo_t	*sp;		/* pointer to returned data */
+
+	/*
+	 * Compute entry number in the summary file.
+	 */
+	so = XFS_SUMOFFS(mp, log, bbno);
+	/*
+	 * Compute the block number in the summary file.
+	 */
+	sb = XFS_SUMOFFSTOBLOCK(mp, so);
+	/*
+	 * If we have an old buffer, and the block number matches, use that.
+	 */
+	if (rbpp && *rbpp && *rsb == sb)
+		bp = *rbpp;
+	/*
+	 * Otherwise we have to get the buffer.
+	 */
+	else {
+		/*
+		 * If there was an old one, get rid of it first.
+		 */
+		if (rbpp && *rbpp)
+			xfs_trans_brelse(tp, *rbpp);
+		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+		if (error) {
+			return error;
+		}
+		/*
+		 * Remember this buffer and block for the next call.
+		 */
+		if (rbpp) {
+			*rbpp = bp;
+			*rsb = sb;
+		}
+	}
+	/*
+	 * Point to the summary information & copy it out.
+	 */
+	sp = XFS_SUMPTR(mp, bp, so);
+	*sum = *sp;
+	/*
+	 * Drop the buffer if we're not asked to remember it.
+	 */
+	if (!rbpp)
+		xfs_trans_brelse(tp, bp);
+	return 0;
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+STATIC int				/* error */
+xfs_rtmodify_range(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to modify */
+	xfs_extlen_t	len,		/* length of extent to modify */
+	int		val)		/* 1 for free, 0 for allocated */
+{
+	xfs_rtword_t	*b;		/* current word in buffer */
+	int		bit;		/* bit number in the word */
+	xfs_rtblock_t	block;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* buf for the block */
+	xfs_rtword_t	*bufp;		/* starting word in buffer */
+	int		error;		/* error value */
+	xfs_rtword_t	*first;		/* first used word in the buffer */
+	int		i;		/* current bit number rel. to start */
+	int		lastbit;	/* last useful bit in word */
+	xfs_rtword_t	mask;		/* mask o frelevant bits for value */
+	int		word;		/* word number in the buffer */
+
+	/*
+	 * Compute starting bitmap block number.
+	 */
+	block = XFS_BITTOBLOCK(mp, start);
+	/*
+	 * Read the bitmap block, and point to its data.
+	 */
+	error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+	if (error) {
+		return error;
+	}
+	bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+	/*
+	 * Compute the starting word's address, and starting bit.
+	 */
+	word = XFS_BITTOWORD(mp, start);
+	first = b = &bufp[word];
+	bit = (int)(start & (XFS_NBWORD - 1));
+	/*
+	 * 0 (allocated) => all zeroes; 1 (free) => all ones.
+	 */
+	val = -val;
+	/*
+	 * If not starting on a word boundary, deal with the first
+	 * (partial) word.
+	 */
+	if (bit) {
+		/*
+		 * Compute first bit not changed and mask of relevant bits.
+		 */
+		lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+		mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		i = lastbit - bit;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	} else {
+		/*
+		 * Starting on a word boundary, no partial word.
+		 */
+		i = 0;
+	}
+	/*
+	 * Loop over whole words in buffers.  When we use up one buffer
+	 * we move on to the next one.
+	 */
+	while (len - i >= XFS_NBWORD) {
+		/*
+		 * Set the word value correctly.
+		 */
+		*b = val;
+		i += XFS_NBWORD;
+		/*
+		 * Go on to the next block if that's where the next word is
+		 * and we need the next word.
+		 */
+		if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+			/*
+			 * Log the changed part of this block.
+			 * Get the next one.
+			 */
+			xfs_trans_log_buf(tp, bp,
+				(uint)((char *)first - (char *)bufp),
+				(uint)((char *)b - (char *)bufp));
+			error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+			if (error) {
+				return error;
+			}
+			first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp);
+			word = 0;
+		} else {
+			/*
+			 * Go on to the next word in the buffer
+			 */
+			b++;
+		}
+	}
+	/*
+	 * If not ending on a word boundary, deal with the last
+	 * (partial) word.
+	 */
+	if ((lastbit = len - i)) {
+		/*
+		 * Compute a mask of relevant bits.
+		 */
+		bit = 0;
+		mask = ((xfs_rtword_t)1 << lastbit) - 1;
+		/*
+		 * Set/clear the active bits.
+		 */
+		if (val)
+			*b |= mask;
+		else
+			*b &= ~mask;
+		b++;
+	}
+	/*
+	 * Log any remaining changed bytes.
+	 */
+	if (b > first)
+		xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+			(uint)((char *)b - (char *)bufp - 1));
+	return 0;
+}
+
+/*
+ * Read and modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+STATIC int				/* error */
+xfs_rtmodify_summary(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	int		delta,		/* change to make to summary info */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	xfs_buf_t	*bp;		/* buffer for the summary block */
+	int		error;		/* error value */
+	xfs_fsblock_t	sb;		/* summary fsblock */
+	int		so;		/* index into the summary file */
+	xfs_suminfo_t	*sp;		/* pointer to returned data */
+
+	/*
+	 * Compute entry number in the summary file.
+	 */
+	so = XFS_SUMOFFS(mp, log, bbno);
+	/*
+	 * Compute the block number in the summary file.
+	 */
+	sb = XFS_SUMOFFSTOBLOCK(mp, so);
+	/*
+	 * If we have an old buffer, and the block number matches, use that.
+	 */
+	if (rbpp && *rbpp && *rsb == sb)
+		bp = *rbpp;
+	/*
+	 * Otherwise we have to get the buffer.
+	 */
+	else {
+		/*
+		 * If there was an old one, get rid of it first.
+		 */
+		if (rbpp && *rbpp)
+			xfs_trans_brelse(tp, *rbpp);
+		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+		if (error) {
+			return error;
+		}
+		/*
+		 * Remember this buffer and block for the next call.
+		 */
+		if (rbpp) {
+			*rbpp = bp;
+			*rsb = sb;
+		}
+	}
+	/*
+	 * Point to the summary information, modify and log it.
+	 */
+	sp = XFS_SUMPTR(mp, bp, so);
+	*sp += delta;
+	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)),
+		(uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1));
+	return 0;
+}
+
+/*
+ * Visible (exported) functions.
+ */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_growfs_rt_t *in)		/* growfs rt input struct */
+{
+	xfs_rtblock_t	bmbno;		/* bitmap block number */
+	xfs_buf_t	*bp;		/* temporary buffer */
+	int		cancelflags;	/* flags for xfs_trans_cancel */
+	int		error;		/* error return value */
+	xfs_inode_t	*ip;		/* bitmap inode, used as lock */
+	xfs_mount_t	*nmp;		/* new (fake) mount structure */
+	xfs_drfsbno_t	nrblocks;	/* new number of realtime blocks */
+	xfs_extlen_t	nrbmblocks;	/* new number of rt bitmap blocks */
+	xfs_drtbno_t	nrextents;	/* new number of realtime extents */
+	uint8_t		nrextslog;	/* new log2 of sb_rextents */
+	xfs_extlen_t	nrsumblocks;	/* new number of summary blocks */
+	uint		nrsumlevels;	/* new rt summary levels */
+	uint		nrsumsize;	/* new size of rt summary, bytes */
+	xfs_sb_t	*nsbp;		/* new superblock */
+	xfs_extlen_t	rbmblocks;	/* current number of rt bitmap blocks */
+	xfs_extlen_t	rsumblocks;	/* current number of rt summary blks */
+	xfs_sb_t	*sbp;		/* old superblock */
+	xfs_fsblock_t	sumbno;		/* summary block number */
+	xfs_trans_t	*tp;		/* transaction pointer */
+
+	sbp = &mp->m_sb;
+	/*
+	 * Initial error checking.
+	 */
+	if (mp->m_rtdev_targp || mp->m_rbmip == NULL ||
+	    (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
+	    (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
+		return XFS_ERROR(EINVAL);
+	/*
+	 * Read in the last block of the device, make sure it exists.
+	 */
+	error = xfs_read_buf(mp, mp->m_rtdev_targp,
+		XFS_FSB_TO_BB(mp, in->newblocks) - 1, 1, 0, &bp);
+	if (error)
+		return error;
+	ASSERT(bp);
+	xfs_buf_relse(bp);
+	/*
+	 * Calculate new parameters.  These are the final values to be reached.
+	 */
+	nrextents = do_div(nrblocks, in->extsize);
+	nrbmblocks = roundup_64(nrextents, NBBY * sbp->sb_blocksize);
+	nrextslog = xfs_highbit32(nrextents);
+	nrsumlevels = nrextslog + 1;
+	nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
+	nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+	nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+	/*
+	 * New summary size can't be more than half the size of
+	 * the log.  This prevents us from getting a log overflow,
+	 * since we'll log basically the whole summary file at once.
+	 */
+	if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
+		return XFS_ERROR(EINVAL);
+	/*
+	 * Get the old block counts for bitmap and summary inodes.
+	 * These can't change since other growfs callers are locked out.
+	 */
+	rbmblocks = XFS_B_TO_FSB(mp, mp->m_rbmip->i_d.di_size);
+	rsumblocks = XFS_B_TO_FSB(mp, mp->m_rsumip->i_d.di_size);
+	/*
+	 * Allocate space to the bitmap and summary files, as necessary.
+	 */
+	if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks,
+			mp->m_sb.sb_rbmino)))
+		return error;
+	if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks,
+			mp->m_sb.sb_rsumino)))
+		return error;
+	nmp = NULL;
+	/*
+	 * Loop over the bitmap blocks.
+	 * We will do everything one bitmap block at a time.
+	 * Skip the current block if it is exactly full.
+	 * This also deals with the case where there were no rtextents before.
+	 */
+	for (bmbno = sbp->sb_rbmblocks -
+		     ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
+	     bmbno < nrbmblocks;
+	     bmbno++) {
+		/*
+		 * Allocate a new (fake) mount/sb.
+		 */
+		nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
+		*nmp = *mp;
+		nsbp = &nmp->m_sb;
+		/*
+		 * Calculate new sb and mount fields for this round.
+		 */
+		nsbp->sb_rextsize = in->extsize;
+		nsbp->sb_rbmblocks = bmbno + 1;
+		nsbp->sb_rblocks =
+			XFS_RTMIN(nrblocks,
+				  nsbp->sb_rbmblocks * NBBY *
+				  nsbp->sb_blocksize * nsbp->sb_rextsize);
+		nsbp->sb_rextents = do_div(nsbp->sb_rblocks, nsbp->sb_rextsize);
+		nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
+		nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
+		nrsumsize =
+			(uint)sizeof(xfs_suminfo_t) * nrsumlevels *
+			nsbp->sb_rbmblocks;
+		nrsumblocks = XFS_B_TO_FSB(mp, nrsumsize);
+		nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+		/*
+		 * Start a transaction, get the log reservation.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
+		cancelflags = 0;
+		if ((error = xfs_trans_reserve(tp, 0,
+				XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
+			goto error_exit;
+		/*
+		 * Lock out other callers by grabbing the bitmap inode lock.
+		 */
+		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino,
+				XFS_ILOCK_EXCL, &ip)))
+			goto error_exit;
+		ASSERT(ip == mp->m_rbmip);
+		/*
+		 * Update the bitmap inode's size.
+		 */
+		mp->m_rbmip->i_d.di_size =
+			nsbp->sb_rbmblocks * nsbp->sb_blocksize;
+		xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+		cancelflags |= XFS_TRANS_ABORT;
+		/*
+		 * Get the summary inode into the transaction.
+		 */
+		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino,
+				XFS_ILOCK_EXCL, &ip)))
+			goto error_exit;
+		ASSERT(ip == mp->m_rsumip);
+		/*
+		 * Update the summary inode's size.
+		 */
+		mp->m_rsumip->i_d.di_size = nmp->m_rsumsize;
+		xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE);
+		/*
+		 * Copy summary data from old to new sizes.
+		 * Do this when the real size (not block-aligned) changes.
+		 */
+		if (sbp->sb_rbmblocks != nsbp->sb_rbmblocks ||
+		    mp->m_rsumlevels != nmp->m_rsumlevels) {
+			error = xfs_rtcopy_summary(mp, nmp, tp);
+			if (error)
+				goto error_exit;
+		}
+		/*
+		 * Update superblock fields.
+		 */
+		if (nsbp->sb_rextsize != sbp->sb_rextsize)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
+				nsbp->sb_rextsize - sbp->sb_rextsize);
+		if (nsbp->sb_rbmblocks != sbp->sb_rbmblocks)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+				nsbp->sb_rbmblocks - sbp->sb_rbmblocks);
+		if (nsbp->sb_rblocks != sbp->sb_rblocks)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
+				nsbp->sb_rblocks - sbp->sb_rblocks);
+		if (nsbp->sb_rextents != sbp->sb_rextents)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
+				nsbp->sb_rextents - sbp->sb_rextents);
+		if (nsbp->sb_rextslog != sbp->sb_rextslog)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+				nsbp->sb_rextslog - sbp->sb_rextslog);
+		/*
+		 * Free new extent.
+		 */
+		bp = NULL;
+		error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
+			nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
+		if (error)
+			goto error_exit;
+		/*
+		 * Mark more blocks free in the superblock.
+		 */
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
+			nsbp->sb_rextents - sbp->sb_rextents);
+		/*
+		 * Free the fake mp structure.
+		 */
+		kmem_free(nmp, sizeof(*nmp));
+		nmp = NULL;
+		/*
+		 * Update mp values into the real mp structure.
+		 */
+		mp->m_rsumlevels = nrsumlevels;
+		mp->m_rsumsize = nrsumsize;
+		/*
+		 * Commit the transaction.
+		 */
+		xfs_trans_commit(tp, 0, NULL);
+	}
+	return 0;
+
+	/*
+	 * Error paths come here.
+	 */
+error_exit:
+	if (nmp)
+		kmem_free(nmp, sizeof(*nmp));
+	xfs_trans_cancel(tp, cancelflags);
+	return error;
+}
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters.	The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int					/* error */
+xfs_rtallocate_extent(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to allocate */
+	xfs_extlen_t	minlen,		/* minimum length to allocate */
+	xfs_extlen_t	maxlen,		/* maximum length to allocate */
+	xfs_extlen_t	*len,		/* out: actual length allocated */
+	xfs_alloctype_t type,		/* allocation type XFS_ALLOCTYPE... */
+	int		wasdel,		/* was a delayed allocation extent */
+	xfs_extlen_t	prod,		/* extent product factor */
+	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
+{
+	int		error;		/* error value */
+	xfs_inode_t	*ip;		/* inode for bitmap file */
+	xfs_mount_t	*mp;		/* file system mount structure */
+	xfs_rtblock_t	r;		/* result allocated block */
+	xfs_fsblock_t	sb;		/* summary file block number */
+	xfs_buf_t	*sumbp;		/* summary file block buffer */
+
+	ASSERT(minlen > 0 && minlen <= maxlen);
+	mp = tp->t_mountp;
+	/*
+	 * If prod is set then figure out what to do to minlen and maxlen.
+	 */
+	if (prod > 1) {
+		xfs_extlen_t	i;
+
+		if ((i = maxlen % prod))
+			maxlen -= i;
+		if ((i = minlen % prod))
+			minlen += prod - i;
+		if (maxlen < minlen) {
+			*rtblock = NULLRTBLOCK;
+			return 0;
+		}
+	}
+	/*
+	 * Lock out other callers by grabbing the bitmap inode lock.
+	 */
+	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, XFS_ILOCK_EXCL, &ip);
+	if (error) {
+		return error;
+	}
+	sumbp = NULL;
+	/*
+	 * Allocate by size, or near another block, or exactly at some block.
+	 */
+	switch (type) {
+	case XFS_ALLOCTYPE_ANY_AG:
+		error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
+				&sumbp, &sb, prod, &r);
+		break;
+	case XFS_ALLOCTYPE_NEAR_BNO:
+		error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
+				len, &sumbp, &sb, prod, &r);
+		break;
+	case XFS_ALLOCTYPE_THIS_BNO:
+		error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen,
+				len, &sumbp, &sb, prod, &r);
+		break;
+	default:
+		ASSERT(0);
+	}
+	if (error) {
+		return error;
+	}
+	/*
+	 * If it worked, update the superblock.
+	 */
+	if (r != NULLRTBLOCK) {
+		long	slen = (long)*len;
+
+		ASSERT(*len >= minlen && *len <= maxlen);
+		if (wasdel)
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
+		else
+			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
+	}
+	*rtblock = r;
+	return 0;
+}
+
+/*
+ * Free an extent in the realtime subvolume.  Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int					/* error */
+xfs_rtfree_extent(
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	bno,		/* starting block number to free */
+	xfs_extlen_t	len)		/* length of extent freed */
+{
+	int		error;		/* error value */
+	xfs_inode_t	*ip;		/* bitmap file inode */
+	xfs_mount_t	*mp;		/* file system mount structure */
+	xfs_fsblock_t	sb;		/* summary file block number */
+	xfs_buf_t	*sumbp;		/* summary file block buffer */
+
+	mp = tp->t_mountp;
+	/*
+	 * Synchronize by locking the bitmap inode.
+	 */
+	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, XFS_ILOCK_EXCL, &ip);
+	if (error) {
+		return error;
+	}
+#if defined(__KERNEL__) && defined(DEBUG)
+	/*
+	 * Check to see that this whole range is currently allocated.
+	 */
+	{
+		int	stat;		/* result from checking range */
+
+		error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat);
+		if (error) {
+			return error;
+		}
+		ASSERT(stat);
+	}
+#endif
+	sumbp = NULL;
+	/*
+	 * Free the range of realtime blocks.
+	 */
+	error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+	if (error) {
+		return error;
+	}
+	/*
+	 * Mark more blocks free in the superblock.
+	 */
+	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+	/*
+	 * If we've now freed all the blocks, reset the file sequence
+	 * number to 0.
+	 */
+	if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+	    mp->m_sb.sb_rextents) {
+		if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+			ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+		*(__uint64_t *)&ip->i_d.di_atime = 0;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	}
+	return 0;
+}
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int				/* error */
+xfs_rtmount_init(
+	xfs_mount_t	*mp)	/* file system mount structure */
+{
+	xfs_buf_t	*bp;	/* buffer for last block of subvolume */
+	xfs_daddr_t	d;	/* address of last block of subvolume */
+	int		error;	/* error return value */
+	xfs_sb_t	*sbp;	/* filesystem superblock copy in mount */
+
+	sbp = &mp->m_sb;
+	if (sbp->sb_rblocks == 0)
+		return 0;
+	if (mp->m_rtdev_targp != NULL) {
+		printk(KERN_WARNING
+		"XFS: This FS has an RT subvol - specify -o rtdev on mount\n");
+		return XFS_ERROR(ENODEV);
+	}
+	mp->m_rsumlevels = sbp->sb_rextslog + 1;
+	mp->m_rsumsize =
+		(uint)sizeof(xfs_suminfo_t) * mp->m_rsumlevels *
+		sbp->sb_rbmblocks;
+	mp->m_rsumsize = roundup(mp->m_rsumsize, sbp->sb_blocksize);
+	mp->m_rbmip = mp->m_rsumip = NULL;
+	/*
+	 * Check that the realtime section is an ok size.
+	 */
+	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
+		printk(KERN_WARNING "XFS: RT mount - %llu != %llu\n",
+			(unsigned long long) XFS_BB_TO_FSB(mp, d),
+			(unsigned long long) mp->m_sb.sb_rblocks);
+		return XFS_ERROR(E2BIG);
+	}
+	error = xfs_read_buf(mp, mp->m_rtdev_targp, d - 1, 1, 0, &bp);
+	if (error) {
+		printk(KERN_WARNING
+			"XFS: RT mount - xfs_read_buf returned %d\n", error);
+		if (error == ENOSPC)
+			return XFS_ERROR(E2BIG);
+		return error;
+	}
+	xfs_buf_relse(bp);
+	return 0;
+}
+
+/*
+ * Get the bitmap and summary inodes into the mount structure
+ * at mount time.
+ */
+int					/* error */
+xfs_rtmount_inodes(
+	xfs_mount_t	*mp)		/* file system mount structure */
+{
+	int		error;		/* error return value */
+	xfs_sb_t	*sbp;
+
+	sbp = &mp->m_sb;
+	if (sbp->sb_rbmino == NULLFSINO)
+		return 0;
+	error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, &mp->m_rbmip, 0);
+	if (error)
+		return error;
+	ASSERT(mp->m_rbmip != NULL);
+	ASSERT(sbp->sb_rsumino != NULLFSINO);
+	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, &mp->m_rsumip, 0);
+	if (error) {
+		vnode_t		*rbmvp;		/* vnode for bitmap file */
+		vmap_t		vmap;		/* vmap to delete vnode */
+
+		rbmvp = XFS_ITOV(mp->m_rbmip);
+		VMAP(rbmvp, mp->m_rbmip, vmap);
+		VN_RELE(rbmvp);
+		vn_purge(rbmvp, &vmap);
+		return error;
+	}
+	ASSERT(mp->m_rsumip != NULL);
+	return 0;
+}
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int					/* error */
+xfs_rtpick_extent(
+	xfs_mount_t	*mp,		/* file system mount point */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_extlen_t	len,		/* allocation length (rtextents) */
+	xfs_rtblock_t	*pick)		/* result rt extent */
+{
+	xfs_rtblock_t	b;		/* result block */
+	int		error;		/* error return value */
+	xfs_inode_t	*ip;		/* bitmap incore inode */
+	int		log2;		/* log of sequence number */
+	__uint64_t	resid;		/* residual after log removed */
+	__uint64_t	seq;		/* sequence number of file creation */
+	__uint64_t	*seqp;		/* pointer to seqno in inode */
+
+	error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, XFS_ILOCK_EXCL, &ip);
+	if (error)
+		return error;
+	ASSERT(ip == mp->m_rbmip);
+	seqp = (__uint64_t *)&ip->i_d.di_atime;
+	if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
+		ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+		*seqp = 0;
+	}
+	seq = *seqp;
+	if ((log2 = xfs_highbit64(seq)) == -1)
+		b = 0;
+	else {
+		resid = seq - (1ULL << log2);
+		b = (mp->m_sb.sb_rextents * ((resid << 1) + 1ULL)) >>
+		    (log2 + 1);
+		if (b >= mp->m_sb.sb_rextents)
+			b = do_mod(b, mp->m_sb.sb_rextents);
+		if (b + len > mp->m_sb.sb_rextents)
+			b = mp->m_sb.sb_rextents - len;
+	}
+	*seqp = seq + 1;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	*pick = b;
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Debug code: print out the value of a range in the bitmap.
+ */
+void
+xfs_rtprint_range(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	xfs_rtblock_t	start,		/* starting block to print */
+	xfs_extlen_t	len)		/* length to print */
+{
+	xfs_extlen_t	i;		/* block number in the extent */
+
+	printk("%Ld: ", (long long)start);
+	for (i = 0; i < len; i++)
+		printk("%d", xfs_rtcheck_bit(mp, tp, start + i, 1));
+	printk("\n");
+}
+
+/*
+ * Debug code: print the summary file.
+ */
+void
+xfs_rtprint_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp)		/* transaction pointer */
+{
+	xfs_suminfo_t	c;		/* summary data */
+	xfs_rtblock_t	i;		/* bitmap block number */
+	int		l;		/* summary information level */
+	int		p;		/* flag for printed anything */
+	xfs_fsblock_t	sb;		/* summary block number */
+	xfs_buf_t	*sumbp;		/* summary block buffer */
+
+	sumbp = NULL;
+	for (l = 0; l < mp->m_rsumlevels; l++) {
+		for (p = 0, i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
+			(void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c);
+			if (c) {
+				if (!p) {
+					printk("%Ld-%Ld:", 1LL << l,
+						XFS_RTMIN((1LL << l) +
+							  ((1LL << l) - 1LL),
+							 mp->m_sb.sb_rextents));
+					p = 1;
+				}
+				printk(" %Ld:%d", (long long)i, c);
+			}
+		}
+		if (p)
+			printk("\n");
+	}
+	if (sumbp)
+		xfs_trans_brelse(tp, sumbp);
+}
+#endif	/* DEBUG */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
new file mode 100644
index 000000000000..286436f823d1
--- /dev/null
+++ b/fs/xfs/xfs_rtalloc.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_RTALLOC_H__
+#define __XFS_RTALLOC_H__
+
+struct xfs_mount;
+struct xfs_trans;
+
+/* Min and max rt extent sizes, specified in bytes */
+#define XFS_MAX_RTEXTSIZE	(1024 * 1024 * 1024)	/* 1GB */
+#define XFS_DFL_RTEXTSIZE	(64 * 1024)		/* 64KB */
+#define XFS_MIN_RTEXTSIZE	(4 * 1024)		/* 4KB */
+
+/*
+ * Constants for bit manipulations.
+ */
+#define XFS_NBBYLOG	3		/* log2(NBBY) */
+#define XFS_WORDLOG	2		/* log2(sizeof(xfs_rtword_t)) */
+#define XFS_NBWORDLOG	(XFS_NBBYLOG + XFS_WORDLOG)
+#define XFS_NBWORD	(1 << XFS_NBWORDLOG)
+#define XFS_WORDMASK	((1 << XFS_WORDLOG) - 1)
+
+#define XFS_BLOCKSIZE(mp)	((mp)->m_sb.sb_blocksize)
+#define XFS_BLOCKMASK(mp)	((mp)->m_blockmask)
+#define XFS_BLOCKWSIZE(mp)	((mp)->m_blockwsize)
+#define XFS_BLOCKWMASK(mp)	((mp)->m_blockwmask)
+
+/*
+ * Summary and bit manipulation macros.
+ */
+#define XFS_SUMOFFS(mp,ls,bb)	((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
+#define XFS_SUMOFFSTOBLOCK(mp,s)	\
+	(((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_SUMPTR(mp,bp,so)	\
+	((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \
+		(((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
+
+#define XFS_BITTOBLOCK(mp,bi)	((bi) >> (mp)->m_blkbit_log)
+#define XFS_BLOCKTOBIT(mp,bb)	((bb) << (mp)->m_blkbit_log)
+#define XFS_BITTOWORD(mp,bi)	\
+	((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
+
+#define XFS_RTMIN(a,b)	((a) < (b) ? (a) : (b))
+#define XFS_RTMAX(a,b)	((a) > (b) ? (a) : (b))
+
+#define XFS_RTLOBIT(w)	xfs_lowbit32(w)
+#define XFS_RTHIBIT(w)	xfs_highbit32(w)
+
+#if XFS_BIG_FILESYSTEMS
+#define XFS_RTBLOCKLOG(b)	xfs_highbit64(b)
+#else
+#define XFS_RTBLOCKLOG(b)	xfs_highbit32(b)
+#endif
+
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_XFS_RT
+/*
+ * Function prototypes for exported functions.
+ */
+
+/*
+ * Allocate an extent in the realtime subvolume, with the usual allocation
+ * parameters.	The length units are all in realtime extents, as is the
+ * result block number.
+ */
+int					/* error */
+xfs_rtallocate_extent(
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_rtblock_t		bno,	/* starting block number to allocate */
+	xfs_extlen_t		minlen, /* minimum length to allocate */
+	xfs_extlen_t		maxlen, /* maximum length to allocate */
+	xfs_extlen_t		*len,	/* out: actual length allocated */
+	xfs_alloctype_t		type,	/* allocation type XFS_ALLOCTYPE... */
+	int			wasdel, /* was a delayed allocation extent */
+	xfs_extlen_t		prod,	/* extent product factor */
+	xfs_rtblock_t		*rtblock); /* out: start block allocated */
+
+/*
+ * Free an extent in the realtime subvolume.  Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int					/* error */
+xfs_rtfree_extent(
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_rtblock_t		bno,	/* starting block number to free */
+	xfs_extlen_t		len);	/* length of extent freed */
+
+/*
+ * Initialize realtime fields in the mount structure.
+ */
+int					/* error */
+xfs_rtmount_init(
+	struct xfs_mount	*mp);	/* file system mount structure */
+
+/*
+ * Get the bitmap and summary inodes into the mount structure
+ * at mount time.
+ */
+int					/* error */
+xfs_rtmount_inodes(
+	struct xfs_mount	*mp);	/* file system mount structure */
+
+/*
+ * Pick an extent for allocation at the start of a new realtime file.
+ * Use the sequence number stored in the atime field of the bitmap inode.
+ * Translate this to a fraction of the rtextents, and return the product
+ * of rtextents and the fraction.
+ * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
+ */
+int					/* error */
+xfs_rtpick_extent(
+	struct xfs_mount	*mp,	/* file system mount point */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_extlen_t		len,	/* allocation length (rtextents) */
+	xfs_rtblock_t		*pick); /* result rt extent */
+
+/*
+ * Debug code: print out the value of a range in the bitmap.
+ */
+void
+xfs_rtprint_range(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	struct xfs_trans	*tp,	/* transaction pointer */
+	xfs_rtblock_t		start,	/* starting block to print */
+	xfs_extlen_t		len);	/* length to print */
+
+/*
+ * Debug code: print the summary file.
+ */
+void
+xfs_rtprint_summary(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	struct xfs_trans	*tp);	/* transaction pointer */
+
+/*
+ * Grow the realtime area of the filesystem.
+ */
+int
+xfs_growfs_rt(
+	struct xfs_mount	*mp,	/* file system mount structure */
+	xfs_growfs_rt_t		*in);	/* user supplied growfs struct */
+
+#else
+# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb)	(ENOSYS)
+# define xfs_rtfree_extent(t,b,l)			(ENOSYS)
+# define xfs_rtpick_extent(m,t,l,rb)			(ENOSYS)
+# define xfs_growfs_rt(mp,in)				(ENOSYS)
+# define xfs_rtmount_init(m)	(((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+# define xfs_rtmount_inodes(m)	(((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
+#endif	/* CONFIG_XFS_RT */
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
new file mode 100644
index 000000000000..0076a84485e8
--- /dev/null
+++ b/fs/xfs/xfs_rw.c
@@ -0,0 +1,753 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+/*
+ * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
+ * which clears the setuid and setgid bits when a file is written.
+ */
+int
+xfs_write_clear_setuid(
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp;
+	xfs_trans_t	*tp;
+	int		error;
+
+	mp = ip->i_mount;
+	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+	if ((error = xfs_trans_reserve(tp, 0,
+				      XFS_WRITEID_LOG_RES(mp),
+				      0, 0, 0))) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	ip->i_d.di_mode &= ~ISUID;
+
+	/*
+	 * Note that we don't have to worry about mandatory
+	 * file locking being disabled here because we only
+	 * clear the ISGID bit if the Group execute bit is
+	 * on, but if it was on then mandatory locking wouldn't
+	 * have been enabled.
+	 */
+	if (ip->i_d.di_mode & (IEXEC >> 3)) {
+		ip->i_d.di_mode &= ~ISGID;
+	}
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0, NULL);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+}
+
+/*
+ * Force a shutdown of the filesystem instantly while keeping
+ * the filesystem consistent. We don't do an unmount here; just shutdown
+ * the shop, make sure that absolutely nothing persistent happens to
+ * this filesystem after this point.
+ */
+
+void
+xfs_do_force_shutdown(
+	bhv_desc_t	*bdp,
+	int		flags,
+	char		*fname,
+	int		lnnum)
+{
+	int		logerror;
+	xfs_mount_t	*mp;
+
+	mp = XFS_BHVTOM(bdp);
+	logerror = flags & XFS_LOG_IO_ERROR;
+
+	if (!(flags & XFS_FORCE_UMOUNT)) {
+		cmn_err(CE_NOTE,
+		"xfs_force_shutdown(%s,0x%x) called from line %d of file %s.  Return address = 0x%x",
+			mp->m_fsname,flags,lnnum,fname,__return_address);
+	}
+	/*
+	 * No need to duplicate efforts.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
+		return;
+
+	/*
+	 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
+	 * queue up anybody new on the log reservations, and wakes up
+	 * everybody who's sleeping on log reservations and tells
+	 * them the bad news.
+	 */
+	if (xfs_log_force_umount(mp, logerror))
+		return;
+
+	if (flags & XFS_CORRUPT_INCORE) {
+		cmn_err(CE_ALERT,
+    "Corruption of in-memory data detected.  Shutting down filesystem: %s",
+			mp->m_fsname);
+	} else if (!(flags & XFS_FORCE_UMOUNT)) {
+		if (logerror) {
+			cmn_err(CE_ALERT,
+			"Log I/O Error Detected.  Shutting down filesystem: %s",
+				mp->m_fsname);
+		} else if (!(flags & XFS_SHUTDOWN_REMOTE_REQ)) {
+			cmn_err(CE_ALERT,
+				"I/O Error Detected.  Shutting down filesystem: %s",
+				mp->m_fsname);
+		}
+	}
+	if (!(flags & XFS_FORCE_UMOUNT)) {
+		cmn_err(CE_ALERT,
+		"Please umount the filesystem, and rectify the problem(s)");
+	}
+}
+
+
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+int
+xfs_bioerror(
+	xfs_buf_t *bp)
+{
+
+#ifdef XFSERRORDEBUG
+	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 */
+	xfs_buftrace("XFS IOERROR", bp);
+	XFS_BUF_ERROR(bp, EIO);
+	/*
+	 * We're calling biodone, so delete B_DONE flag. Either way
+	 * we have to call the iodone callback, and calling biodone
+	 * probably is the best way since it takes care of
+	 * GRIO as well.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_STALE(bp);
+
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	xfs_biodone(bp);
+
+	return (EIO);
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+int
+xfs_bioerror_relse(
+	xfs_buf_t *bp)
+{
+	int64_t fl;
+
+	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
+	ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
+
+	xfs_buftrace("XFS IOERRELSE", bp);
+	fl = XFS_BUF_BFLAGS(bp);
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 *
+	 * chunkhold expects B_DONE to be set, whether
+	 * we actually finish the I/O or not. We don't want to
+	 * change that interface.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_STALE(bp);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	if (!(fl & XFS_B_ASYNC)) {
+		/*
+		 * Mark b_error and B_ERROR _both_.
+		 * Lot's of chunkcache code assumes that.
+		 * There's no reason to mark error for
+		 * ASYNC buffers.
+		 */
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_V_IODONESEMA(bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+	return (EIO);
+}
+/*
+ * Prints out an ALERT message about I/O error.
+ */
+void
+xfs_ioerror_alert(
+	char			*func,
+	struct xfs_mount	*mp,
+	xfs_buf_t		*bp,
+	xfs_daddr_t		blkno)
+{
+	cmn_err(CE_ALERT,
+ "I/O error in filesystem (\"%s\") meta-data dev 0x%x block 0x%llx\n"
+ "	 (\"%s\") error %d buf count %u",
+		(!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
+		XFS_BUF_TARGET_DEV(bp),
+		(__uint64_t)blkno,
+		func,
+		XFS_BUF_GETERROR(bp),
+		XFS_BUF_COUNT(bp));
+}
+
+/*
+ * This isn't an absolute requirement, but it is
+ * just a good idea to call xfs_read_buf instead of
+ * directly doing a read_buf call. For one, we shouldn't
+ * be doing this disk read if we are in SHUTDOWN state anyway,
+ * so this stops that from happening. Secondly, this does all
+ * the error checking stuff and the brelse if appropriate for
+ * the caller, so the code can be a little leaner.
+ */
+
+int
+xfs_read_buf(
+	struct xfs_mount *mp,
+	xfs_buftarg_t	 *target,
+	xfs_daddr_t	 blkno,
+	int		 len,
+	uint		 flags,
+	xfs_buf_t	 **bpp)
+{
+	xfs_buf_t	 *bp;
+	int		 error;
+
+	if (flags)
+		bp = xfs_buf_read_flags(target, blkno, len, flags);
+	else
+		bp = xfs_buf_read(target, blkno, len, flags);
+	if (!bp)
+		return XFS_ERROR(EIO);
+	error = XFS_BUF_GETERROR(bp);
+	if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
+		*bpp = bp;
+	} else {
+		*bpp = NULL;
+		if (error) {
+			xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
+		} else {
+			error = XFS_ERROR(EIO);
+		}
+		if (bp) {
+			XFS_BUF_UNDONE(bp);
+			XFS_BUF_UNDELAYWRITE(bp);
+			XFS_BUF_STALE(bp);
+			/*
+			 * brelse clears B_ERROR and b_error
+			 */
+			xfs_buf_relse(bp);
+		}
+	}
+	return (error);
+}
+
+/*
+ * Wrapper around bwrite() so that we can trap
+ * write errors, and act accordingly.
+ */
+int
+xfs_bwrite(
+	struct xfs_mount *mp,
+	struct xfs_buf	 *bp)
+{
+	int	error;
+
+	/*
+	 * XXXsup how does this work for quotas.
+	 */
+	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
+	XFS_BUF_SET_FSPRIVATE3(bp, mp);
+	XFS_BUF_WRITE(bp);
+
+	if ((error = XFS_bwrite(bp))) {
+		ASSERT(mp);
+		/*
+		 * Cannot put a buftrace here since if the buffer is not
+		 * B_HOLD then we will brelse() the buffer before returning
+		 * from bwrite and we could be tracing a buffer that has
+		 * been reused.
+		 */
+		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
+	}
+	return (error);
+}
+
+/*
+ * xfs_inval_cached_pages()
+ * This routine is responsible for keeping direct I/O and buffered I/O
+ * somewhat coherent.  From here we make sure that we're at least
+ * temporarily holding the inode I/O lock exclusively and then call
+ * the page cache to flush and invalidate any cached pages.  If there
+ * are no cached pages this routine will be very quick.
+ */
+void
+xfs_inval_cached_pages(
+	vnode_t		*vp,
+	xfs_iocore_t	*io,
+	xfs_off_t	offset,
+	int		write,
+	int		relock)
+{
+	xfs_mount_t	*mp;
+
+	if (!VN_CACHED(vp)) {
+		return;
+	}
+
+	mp = io->io_mount;
+
+	/*
+	 * We need to get the I/O lock exclusively in order
+	 * to safely invalidate pages and mappings.
+	 */
+	if (relock) {
+		XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
+		XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL);
+	}
+
+	/* Writing beyond EOF creates a hole that must be zeroed */
+	if (write && (offset > XFS_SIZE(mp, io))) {
+		xfs_fsize_t	isize;
+
+		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+		isize = XFS_SIZE(mp, io);
+		if (offset > isize) {
+			xfs_zero_eof(vp, io, offset, isize, offset, NULL);
+		}
+		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
+	}
+
+	VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
+	if (relock) {
+		XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
+	}
+}
+
+
+
+spinlock_t	xfs_refcache_lock = SPIN_LOCK_UNLOCKED;
+xfs_inode_t	**xfs_refcache;
+int		xfs_refcache_size;
+int		xfs_refcache_index;
+int		xfs_refcache_busy;
+int		xfs_refcache_count;
+
+/*
+ * Timer callback to mark SB dirty, make sure we keep purging refcache
+ */
+void
+xfs_refcache_sbdirty(struct super_block *sb)
+{
+	sb->s_dirt = 1;
+}
+
+/*
+ * Insert the given inode into the reference cache.
+ */
+void
+xfs_refcache_insert(
+	xfs_inode_t	*ip)
+{
+	vnode_t		*vp;
+	xfs_inode_t	*release_ip;
+	xfs_inode_t	**refcache;
+
+	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE));
+
+	/*
+	 * If an unmount is busy blowing entries out of the cache,
+	 * then don't bother.
+	 */
+	if (xfs_refcache_busy) {
+		return;
+	}
+
+	/*
+	 * If we tuned the refcache down to zero, don't do anything.
+	 */
+	 if (!xfs_refcache_size) {
+		return;
+	}
+
+	/*
+	 * The inode is already in the refcache, so don't bother
+	 * with it.
+	 */
+	if (ip->i_refcache != NULL) {
+		return;
+	}
+
+	vp = XFS_ITOV(ip);
+	/* ASSERT(vp->v_count > 0); */
+	VN_HOLD(vp);
+
+	/*
+	 * We allocate the reference cache on use so that we don't
+	 * waste the memory on systems not being used as NFS servers.
+	 */
+	if (xfs_refcache == NULL) {
+		refcache = (xfs_inode_t **)kmem_zalloc(XFS_REFCACHE_SIZE_MAX *
+						       sizeof(xfs_inode_t *),
+						       KM_SLEEP);
+	} else {
+		refcache = NULL;
+	}
+
+	spin_lock(&xfs_refcache_lock);
+
+	/*
+	 * If we allocated memory for the refcache above and it still
+	 * needs it, then use the memory we allocated.	Otherwise we'll
+	 * free the memory below.
+	 */
+	if (refcache != NULL) {
+		if (xfs_refcache == NULL) {
+			xfs_refcache = refcache;
+			refcache = NULL;
+		}
+	}
+
+	/*
+	 * If an unmount is busy clearing out the cache, don't add new
+	 * entries to it.
+	 */
+	if (xfs_refcache_busy) {
+		spin_unlock(&xfs_refcache_lock);
+		VN_RELE(vp);
+		/*
+		 * If we allocated memory for the refcache above but someone
+		 * else beat us to using it, then free the memory now.
+		 */
+		if (refcache != NULL) {
+			kmem_free(refcache,
+				  XFS_REFCACHE_SIZE_MAX * sizeof(xfs_inode_t *));
+		}
+		return;
+	}
+	release_ip = xfs_refcache[xfs_refcache_index];
+	if (release_ip != NULL) {
+		release_ip->i_refcache = NULL;
+		xfs_refcache_count--;
+		ASSERT(xfs_refcache_count >= 0);
+	}
+	xfs_refcache[xfs_refcache_index] = ip;
+	ASSERT(ip->i_refcache == NULL);
+	ip->i_refcache = &(xfs_refcache[xfs_refcache_index]);
+	xfs_refcache_count++;
+	ASSERT(xfs_refcache_count <= xfs_refcache_size);
+	xfs_refcache_index++;
+	if (xfs_refcache_index == xfs_refcache_size) {
+		xfs_refcache_index = 0;
+	}
+	spin_unlock(&xfs_refcache_lock);
+
+	/*
+	 * Save the pointer to the inode to be released so that we can
+	 * VN_RELE it once we've dropped our inode locks in xfs_rwunlock().
+	 * The pointer may be NULL, but that's OK.
+	 */
+	ip->i_release = release_ip;
+
+	/*
+	 * If we allocated memory for the refcache above but someone
+	 * else beat us to using it, then free the memory now.
+	 */
+	if (refcache != NULL) {
+		kmem_free(refcache,
+			  XFS_REFCACHE_SIZE_MAX * sizeof(xfs_inode_t *));
+	}
+	return;
+}
+
+
+/*
+ * If the given inode is in the reference cache, purge its entry and
+ * release the reference on the vnode.
+ */
+void
+xfs_refcache_purge_ip(
+	xfs_inode_t	*ip)
+{
+	vnode_t *vp;
+	int	error;
+
+	/*
+	 * If we're not pointing to our entry in the cache, then
+	 * we must not be in the cache.
+	 */
+	if (ip->i_refcache == NULL) {
+		return;
+	}
+
+	spin_lock(&xfs_refcache_lock);
+	if (ip->i_refcache == NULL) {
+		spin_unlock(&xfs_refcache_lock);
+		return;
+	}
+
+	/*
+	 * Clear both our pointer to the cache entry and its pointer
+	 * back to us.
+	 */
+	ASSERT(*(ip->i_refcache) == ip);
+	*(ip->i_refcache) = NULL;
+	ip->i_refcache = NULL;
+	xfs_refcache_count--;
+	ASSERT(xfs_refcache_count >= 0);
+	spin_unlock(&xfs_refcache_lock);
+
+	vp = XFS_ITOV(ip);
+	/* ASSERT(vp->v_count > 1); */
+	VOP_RELEASE(vp, error);
+	VN_RELE(vp);
+
+	return;
+}
+
+
+/*
+ * This is called from the XFS unmount code to purge all entries for the
+ * given mount from the cache.	It uses the refcache busy counter to
+ * make sure that new entries are not added to the cache as we purge them.
+ */
+void
+xfs_refcache_purge_mp(
+	xfs_mount_t	*mp)
+{
+	vnode_t		*vp;
+	int		error, i;
+	xfs_inode_t	*ip;
+
+	if (xfs_refcache == NULL) {
+		return;
+	}
+
+	spin_lock(&xfs_refcache_lock);
+	/*
+	 * Bumping the busy counter keeps new entries from being added
+	 * to the cache.  We use a counter since multiple unmounts could
+	 * be in here simultaneously.
+	 */
+	xfs_refcache_busy++;
+
+	for (i = 0; i < xfs_refcache_size; i++) {
+		ip = xfs_refcache[i];
+		if ((ip != NULL) && (ip->i_mount == mp)) {
+			xfs_refcache[i] = NULL;
+			ip->i_refcache = NULL;
+			xfs_refcache_count--;
+			ASSERT(xfs_refcache_count >= 0);
+			spin_unlock(&xfs_refcache_lock);
+			vp = XFS_ITOV(ip);
+			VOP_RELEASE(vp, error);
+			VN_RELE(vp);
+			spin_lock(&xfs_refcache_lock);
+		}
+	}
+
+	xfs_refcache_busy--;
+	ASSERT(xfs_refcache_busy >= 0);
+	spin_unlock(&xfs_refcache_lock);
+}
+
+
+/*
+ * This is called from the XFS sync code to ensure that the refcache
+ * is emptied out over time.  We purge a small number of entries with
+ * each call.
+ */
+void
+xfs_refcache_purge_some(xfs_mount_t *mp)
+{
+	int		error, i;
+	xfs_inode_t	*ip;
+	int		iplist_index;
+	xfs_inode_t	**iplist;
+	int		purge_count;
+
+	if ((xfs_refcache == NULL) || (xfs_refcache_count == 0)) {
+		return;
+	}
+
+	iplist_index = 0;
+	purge_count = xfs_params.refcache_purge;
+	iplist = (xfs_inode_t **)kmem_zalloc(purge_count *
+					  sizeof(xfs_inode_t *), KM_SLEEP);
+
+	spin_lock(&xfs_refcache_lock);
+
+	/*
+	 * Store any inodes we find in the next several entries
+	 * into the iplist array to be released after dropping
+	 * the spinlock.  We always start looking from the currently
+	 * oldest place in the cache.  We move the refcache index
+	 * forward as we go so that we are sure to eventually clear
+	 * out the entire cache when the system goes idle.
+	 */
+	for (i = 0; i < purge_count; i++) {
+		ip = xfs_refcache[xfs_refcache_index];
+		if (ip != NULL) {
+			xfs_refcache[xfs_refcache_index] = NULL;
+			ip->i_refcache = NULL;
+			xfs_refcache_count--;
+			ASSERT(xfs_refcache_count >= 0);
+			iplist[iplist_index] = ip;
+			iplist_index++;
+		}
+		xfs_refcache_index++;
+		if (xfs_refcache_index == xfs_refcache_size) {
+			xfs_refcache_index = 0;
+		}
+	}
+
+	spin_unlock(&xfs_refcache_lock);
+
+	/*
+	 * If there are still entries in the refcache,
+	 * set timer to mark the SB dirty to make sure that
+	 * we hit sync even if filesystem is idle, so that we'll
+	 * purge some more later.
+	 */
+	if (xfs_refcache_count) {
+		del_timer_sync(&mp->m_sbdirty_timer);
+		mp->m_sbdirty_timer.data =
+		    (unsigned long)LINVFS_GET_IP(XFS_ITOV(mp->m_rootip))->i_sb;
+		mp->m_sbdirty_timer.expires = jiffies + 2*HZ;
+		add_timer(&mp->m_sbdirty_timer);
+	}
+
+	/*
+	 * Now drop the inodes we collected.
+	 */
+	for (i = 0; i < iplist_index; i++) {
+		VOP_RELEASE(XFS_ITOV(iplist[i]), error);
+		VN_RELE(XFS_ITOV(iplist[i]));
+	}
+
+	kmem_free(iplist, purge_count *
+			  sizeof(xfs_inode_t *));
+}
+
+/*
+ * This is called when the refcache is dynamically resized
+ * via a sysctl.
+ *
+ * If the new size is smaller than the old size, purge all
+ * entries in slots greater than the new size, and move
+ * the index if necessary.
+ *
+ * If the refcache hasn't even been allocated yet, or the
+ * new size is larger than the old size, just set the value
+ * of xfs_refcache_size.
+ */
+
+void
+xfs_refcache_resize(int xfs_refcache_new_size)
+{
+	int		i;
+	xfs_inode_t	*ip;
+	int		iplist_index = 0;
+	xfs_inode_t	**iplist;
+	int		error;
+
+	/*
+	 * If the new size is smaller than the current size,
+	 * purge entries to create smaller cache, and
+	 * reposition index if necessary.
+	 * Don't bother if no refcache yet.
+	 */
+	if (xfs_refcache && (xfs_refcache_new_size < xfs_refcache_size)) {
+
+		iplist = (xfs_inode_t **)kmem_zalloc(XFS_REFCACHE_SIZE_MAX *
+				sizeof(xfs_inode_t *), KM_SLEEP);
+
+		spin_lock(&xfs_refcache_lock);
+
+		for (i = xfs_refcache_new_size; i < xfs_refcache_size; i++) {
+			ip = xfs_refcache[i];
+			if (ip != NULL) {
+				xfs_refcache[i] = NULL;
+				ip->i_refcache = NULL;
+				xfs_refcache_count--;
+				ASSERT(xfs_refcache_count >= 0);
+				iplist[iplist_index] = ip;
+				iplist_index++;
+			}
+		}
+
+		xfs_refcache_size = xfs_refcache_new_size;
+
+		/*
+		 * Move index to beginning of cache if it's now past the end
+		 */
+		if (xfs_refcache_index >= xfs_refcache_new_size)
+			xfs_refcache_index = 0;
+
+		spin_unlock(&xfs_refcache_lock);
+
+		/*
+		 * Now drop the inodes we collected.
+		 */
+		for (i = 0; i < iplist_index; i++) {
+			VOP_RELEASE(XFS_ITOV(iplist[i]), error);
+			VN_RELE(XFS_ITOV(iplist[i]));
+		}
+
+		kmem_free(iplist, XFS_REFCACHE_SIZE_MAX *
+				  sizeof(xfs_inode_t *));
+	} else {
+		spin_lock(&xfs_refcache_lock);
+		xfs_refcache_size = xfs_refcache_new_size;
+		spin_unlock(&xfs_refcache_lock);
+	}
+}
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
new file mode 100644
index 000000000000..f2f4c5d88738
--- /dev/null
+++ b/fs/xfs/xfs_rw.h
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_RW_H__
+#define __XFS_RW_H__
+
+struct bhv_desc;
+struct bmapval;
+struct xfs_buf;
+struct cred;
+struct uio;
+struct vnode;
+struct xfs_inode;
+struct xfs_iocore;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dio;
+struct pm;
+
+/*
+ * Maximum count of bmaps used by read and write paths.
+ */
+#define XFS_MAX_RW_NBMAPS	4
+
+/*
+ * Counts of readahead buffers to use based on physical memory size.
+ * None of these should be more than XFS_MAX_RW_NBMAPS.
+ */
+#define XFS_RW_NREADAHEAD_16MB	2
+#define XFS_RW_NREADAHEAD_32MB	3
+#define XFS_RW_NREADAHEAD_K32	4
+#define XFS_RW_NREADAHEAD_K64	4
+
+/*
+ * Maximum size of a buffer that we\'ll map.  Making this
+ * too big will degrade performance due to the number of
+ * pages which need to be gathered.  Making it too small
+ * will prevent us from doing large I/O\'s to hardware that
+ * needs it.
+ *
+ * This is currently set to 512 KB.
+ */
+#define XFS_MAX_BMAP_LEN_BB	1024
+#define XFS_MAX_BMAP_LEN_BYTES	524288
+
+/*
+ * Maximum size (in inodes) for the nfs refcache
+ */
+#define XFS_REFCACHE_SIZE_MAX	512
+
+
+/*
+ * Convert the given file system block to a disk block.
+ * We have to treat it differently based on whether the
+ * file is a real time file or not, because the bmap code
+ * does.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DB)
+xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
+#define XFS_FSB_TO_DB(ip,fsb)	xfs_fsb_to_db(ip,fsb)
+#else
+#define XFS_FSB_TO_DB(ip,fsb) \
+		(((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) ? \
+		 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
+		 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)))
+#endif
+
+#define XFS_FSB_TO_DB_IO(io,fsb) \
+		(((io)->io_flags & XFS_IOCORE_RT) ? \
+		 XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \
+		 XFS_FSB_TO_DADDR((io)->io_mount, (fsb)))
+
+/*
+ * Defines for the trace mechanisms in xfs_rw.c.
+ */
+#define XFS_RW_KTRACE_SIZE	64
+#define XFS_STRAT_KTRACE_SIZE	64
+#define XFS_STRAT_GTRACE_SIZE	512
+
+#define XFS_READ_ENTER		1
+#define XFS_WRITE_ENTER		2
+#define XFS_IOMAP_READ_ENTER	3
+#define XFS_IOMAP_WRITE_ENTER	4
+#define XFS_IOMAP_READ_MAP	5
+#define XFS_IOMAP_WRITE_MAP	6
+#define XFS_IOMAP_WRITE_NOSPACE 7
+#define XFS_ITRUNC_START	8
+#define XFS_ITRUNC_FINISH1	9
+#define XFS_ITRUNC_FINISH2	10
+#define XFS_CTRUNC1		11
+#define XFS_CTRUNC2		12
+#define XFS_CTRUNC3		13
+#define XFS_CTRUNC4		14
+#define XFS_CTRUNC5		15
+#define XFS_CTRUNC6		16
+#define XFS_BUNMAPI		17
+#define XFS_INVAL_CACHED	18
+#define XFS_DIORD_ENTER		19
+#define XFS_DIOWR_ENTER		20
+
+#if defined(XFS_ALL_TRACE)
+#define XFS_RW_TRACE
+#define XFS_STRAT_TRACE
+#endif
+
+#if !defined(DEBUG)
+#undef XFS_RW_TRACE
+#undef XFS_STRAT_TRACE
+#endif
+
+/*
+ * Prototypes for functions in xfs_rw.c.
+ */
+
+int
+xfs_write_clear_setuid(
+	struct xfs_inode	*ip);
+
+int
+xfs_bwrite(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp);
+
+void
+xfs_inval_cached_pages(
+	struct vnode		*vp,
+	struct xfs_iocore	*io,
+	xfs_off_t		offset,
+	int			write,
+	int			relock);
+
+void
+xfs_refcache_insert(
+	struct xfs_inode	*ip);
+
+void
+xfs_refcache_purge_ip(
+	struct xfs_inode	*ip);
+
+void
+xfs_refcache_purge_mp(
+	struct xfs_mount	*mp);
+
+void
+xfs_refcache_purge_some(
+	struct xfs_mount	*mp);
+
+void
+xfs_refcache_resize(
+	int			xfs_refcache_new_size);
+
+int
+xfs_bioerror(
+	struct xfs_buf		*b);
+
+/*
+ * XFS I/O core functions
+ */
+extern int xfs_bioerror_relse(struct xfs_buf *);
+
+
+/*
+ * Needed by xfs_rw.c
+ */
+int
+xfs_rwlock(
+	bhv_desc_t	*bdp,
+	vrwlock_t	write_lock);
+
+void
+xfs_rwunlock(
+	bhv_desc_t	*bdp,
+	vrwlock_t	write_lock);
+
+int
+xfs_read_buf(
+	struct xfs_mount *mp,
+	xfs_buftarg_t	 *target,
+	xfs_daddr_t	 blkno,
+	int		 len,
+	uint		 flags,
+	struct xfs_buf	 **bpp);
+
+void
+xfs_ioerror_alert(
+	char			*func,
+	struct xfs_mount	*mp,
+	xfs_buf_t		*bp,
+	xfs_daddr_t		blkno);
+
+#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
new file mode 100644
index 000000000000..729b61c81403
--- /dev/null
+++ b/fs/xfs/xfs_sb.h
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2000-2001 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_SB_H__
+#define __XFS_SB_H__
+
+/*
+ * Super block
+ * Fits into a 512-byte buffer at daddr_t 0 of each allocation group.
+ * Only the first of these is ever updated except during growfs.
+ */
+
+struct xfs_buf;
+struct xfs_mount;
+
+#define XFS_SB_MAGIC		0x58465342	/* 'XFSB' */
+#define XFS_SB_VERSION_1	1		/* 5.3, 6.0.1, 6.1 */
+#define XFS_SB_VERSION_2	2		/* 6.2 - attributes */
+#define XFS_SB_VERSION_3	3		/* 6.2 - new inode version */
+#define XFS_SB_VERSION_4	4		/* 6.2+ - bitmask version */
+#define XFS_SB_VERSION_NUMBITS		0x000f
+#define XFS_SB_VERSION_ALLFBITS		0xfff0
+#define XFS_SB_VERSION_SASHFBITS	0xf000
+#define XFS_SB_VERSION_REALFBITS	0x0ff0
+#define XFS_SB_VERSION_ATTRBIT		0x0010
+#define XFS_SB_VERSION_NLINKBIT		0x0020
+#define XFS_SB_VERSION_QUOTABIT		0x0040
+#define XFS_SB_VERSION_ALIGNBIT		0x0080
+#define XFS_SB_VERSION_DALIGNBIT	0x0100
+#define XFS_SB_VERSION_SHAREDBIT	0x0200
+#define XFS_SB_VERSION_LOGV2BIT		0x0400
+#define XFS_SB_VERSION_EXTFLGBIT	0x1000
+#define XFS_SB_VERSION_DIRV2BIT		0x2000
+#define XFS_SB_VERSION_OKSASHFBITS	\
+	(XFS_SB_VERSION_EXTFLGBIT | \
+	 XFS_SB_VERSION_DIRV2BIT)
+#define XFS_SB_VERSION_OKREALFBITS	\
+	(XFS_SB_VERSION_ATTRBIT | \
+	 XFS_SB_VERSION_NLINKBIT | \
+	 XFS_SB_VERSION_QUOTABIT | \
+	 XFS_SB_VERSION_ALIGNBIT | \
+	 XFS_SB_VERSION_DALIGNBIT | \
+	 XFS_SB_VERSION_SHAREDBIT | \
+	 XFS_SB_VERSION_LOGV2BIT)
+#define XFS_SB_VERSION_OKSASHBITS	\
+	(XFS_SB_VERSION_NUMBITS | \
+	 XFS_SB_VERSION_REALFBITS | \
+	 XFS_SB_VERSION_OKSASHFBITS)
+#define XFS_SB_VERSION_OKREALBITS	\
+	(XFS_SB_VERSION_NUMBITS | \
+	 XFS_SB_VERSION_OKREALFBITS | \
+	 XFS_SB_VERSION_OKSASHFBITS)
+#define XFS_SB_VERSION_MKFS(ia,dia,extflag,dirv2,na)	\
+	(((ia) || (dia) || (extflag) || (dirv2) || (na)) ? \
+		(XFS_SB_VERSION_4 | \
+		 ((ia) ? XFS_SB_VERSION_ALIGNBIT : 0) | \
+		 ((dia) ? XFS_SB_VERSION_DALIGNBIT : 0) | \
+		 ((extflag) ? XFS_SB_VERSION_EXTFLGBIT : 0) | \
+		 ((dirv2) ? XFS_SB_VERSION_DIRV2BIT : 0) | \
+		 ((na) ? XFS_SB_VERSION_LOGV2BIT : 0)) : \
+		XFS_SB_VERSION_1)
+
+typedef struct xfs_sb
+{
+	__uint32_t	sb_magicnum;	/* magic number == XFS_SB_MAGIC */
+	__uint32_t	sb_blocksize;	/* logical block size, bytes */
+	xfs_drfsbno_t	sb_dblocks;	/* number of data blocks */
+	xfs_drfsbno_t	sb_rblocks;	/* number of realtime blocks */
+	xfs_drtbno_t	sb_rextents;	/* number of realtime extents */
+	uuid_t		sb_uuid;	/* file system unique id */
+	xfs_dfsbno_t	sb_logstart;	/* starting block of log if internal */
+	xfs_ino_t	sb_rootino;	/* root inode number */
+	xfs_ino_t	sb_rbmino;	/* bitmap inode for realtime extents */
+	xfs_ino_t	sb_rsumino;	/* summary inode for rt bitmap */
+	xfs_agblock_t	sb_rextsize;	/* realtime extent size, blocks */
+	xfs_agblock_t	sb_agblocks;	/* size of an allocation group */
+	xfs_agnumber_t	sb_agcount;	/* number of allocation groups */
+	xfs_extlen_t	sb_rbmblocks;	/* number of rt bitmap blocks */
+	xfs_extlen_t	sb_logblocks;	/* number of log blocks */
+	__uint16_t	sb_versionnum;	/* header version == XFS_SB_VERSION */
+	__uint16_t	sb_sectsize;	/* volume sector size, bytes */
+	__uint16_t	sb_inodesize;	/* inode size, bytes */
+	__uint16_t	sb_inopblock;	/* inodes per block */
+	char		sb_fname[12];	/* file system name */
+	__uint8_t	sb_blocklog;	/* log2 of sb_blocksize */
+	__uint8_t	sb_sectlog;	/* log2 of sb_sectsize */
+	__uint8_t	sb_inodelog;	/* log2 of sb_inodesize */
+	__uint8_t	sb_inopblog;	/* log2 of sb_inopblock */
+	__uint8_t	sb_agblklog;	/* log2 of sb_agblocks (rounded up) */
+	__uint8_t	sb_rextslog;	/* log2 of sb_rextents */
+	__uint8_t	sb_inprogress;	/* mkfs is in progress, don't mount */
+	__uint8_t	sb_imax_pct;	/* max % of fs for inode space */
+					/* statistics */
+	/*
+	 * These fields must remain contiguous.	 If you really
+	 * want to change their layout, make sure you fix the
+	 * code in xfs_trans_apply_sb_deltas().
+	 */
+	__uint64_t	sb_icount;	/* allocated inodes */
+	__uint64_t	sb_ifree;	/* free inodes */
+	__uint64_t	sb_fdblocks;	/* free data blocks */
+	__uint64_t	sb_frextents;	/* free realtime extents */
+	/*
+	 * End contiguous fields.
+	 */
+	xfs_ino_t	sb_uquotino;	/* user quota inode */
+	xfs_ino_t	sb_gquotino;	/* group quota inode */
+	__uint16_t	sb_qflags;	/* quota flags */
+	__uint8_t	sb_flags;	/* misc. flags */
+	__uint8_t	sb_shared_vn;	/* shared version number */
+	xfs_extlen_t	sb_inoalignmt;	/* inode chunk alignment, fsblocks */
+	__uint32_t	sb_unit;	/* stripe or raid unit */
+	__uint32_t	sb_width;	/* stripe or raid width */
+	__uint8_t	sb_dirblklog;	/* log2 of dir block size (fsbs) */
+	__uint8_t	sb_dummy[3];	/* padding */
+	__uint32_t	sb_logsunit;	/* stripe unit size for the log */
+} xfs_sb_t;
+
+/*
+ * Sequence number values for the fields.
+ */
+typedef enum {
+	XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
+	XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
+	XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
+	XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
+	XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
+	XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
+	XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
+	XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
+	XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
+	XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
+	XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
+	XFS_SBS_DUMMY, XFS_SBS_LOGSUNIT,
+	XFS_SBS_FIELDCOUNT
+} xfs_sb_field_t;
+
+/*
+ * Mask values, defined based on the xfs_sb_field_t values.
+ * Only define the ones we're using.
+ */
+#define XFS_SB_MVAL(x)		(1LL << XFS_SBS_ ## x)
+#define XFS_SB_UUID		XFS_SB_MVAL(UUID)
+#define XFS_SB_FNAME		XFS_SB_MVAL(FNAME)
+#define XFS_SB_ROOTINO		XFS_SB_MVAL(ROOTINO)
+#define XFS_SB_RBMINO		XFS_SB_MVAL(RBMINO)
+#define XFS_SB_RSUMINO		XFS_SB_MVAL(RSUMINO)
+#define XFS_SB_VERSIONNUM	XFS_SB_MVAL(VERSIONNUM)
+#define XFS_SB_UQUOTINO		XFS_SB_MVAL(UQUOTINO)
+#define XFS_SB_GQUOTINO		XFS_SB_MVAL(GQUOTINO)
+#define XFS_SB_QFLAGS		XFS_SB_MVAL(QFLAGS)
+#define XFS_SB_SHARED_VN	XFS_SB_MVAL(SHARED_VN)
+#define XFS_SB_UNIT		XFS_SB_MVAL(UNIT)
+#define XFS_SB_WIDTH		XFS_SB_MVAL(WIDTH)
+#define XFS_SB_NUM_BITS		((int)XFS_SBS_FIELDCOUNT)
+#define XFS_SB_ALL_BITS		((1LL << XFS_SB_NUM_BITS) - 1)
+#define XFS_SB_MOD_BITS		\
+	(XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
+	 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
+	 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH)
+
+/*
+ * Misc. Flags - warning - these will be cleared by xfs_repair unless
+ * a feature bit is set when the flag is used.
+ */
+#define XFS_SBF_NOFLAGS		0x00	/* no flags set */
+#define XFS_SBF_READONLY	0x01	/* only read-only mounts allowed */
+
+/*
+ * define max. shared version we can interoperate with
+ */
+#define XFS_SB_MAX_SHARED_VN	0
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_NUM)
+int xfs_sb_version_num(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_NUM(sbp) xfs_sb_version_num(sbp)
+#else
+#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_GOOD_VERSION)
+int xfs_sb_good_version(xfs_sb_t *sbp);
+#define XFS_SB_GOOD_VERSION(sbp)	xfs_sb_good_version(sbp)
+#else
+#define XFS_SB_GOOD_VERSION_INT(sbp)	\
+	((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
+	  ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
+	 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	  !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS)
+#ifdef __KERNEL__
+#define XFS_SB_GOOD_VERSION(sbp)	\
+	(XFS_SB_GOOD_VERSION_INT(sbp) && \
+	  (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN) ))
+#else
+/*
+ * extra 2 paren's here (( to unconfuse paren-matching editors
+ * like vi because XFS_SB_GOOD_VERSION_INT is a partial expression
+ * and the two XFS_SB_GOOD_VERSION's each 2 more close paren's to
+ * complete the expression.
+ */
+#define XFS_SB_GOOD_VERSION(sbp)	\
+	(XFS_SB_GOOD_VERSION_INT(sbp) && \
+	  (!((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \
+	   (sbp)->sb_shared_vn <= XFS_SB_MAX_SHARED_VN)) ))
+#endif /* __KERNEL__ */
+#endif
+
+#define XFS_SB_GOOD_SASH_VERSION(sbp)	\
+	((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
+	  ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
+	 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	  !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS)))
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TONEW)
+unsigned xfs_sb_version_tonew(unsigned v);
+#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v)
+#else
+#define XFS_SB_VERSION_TONEW(v) \
+	((((v) == XFS_SB_VERSION_1) ? \
+		0 : \
+		(((v) == XFS_SB_VERSION_2) ? \
+			XFS_SB_VERSION_ATTRBIT : \
+			(XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \
+	 XFS_SB_VERSION_4)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_TOOLD)
+unsigned xfs_sb_version_toold(unsigned v);
+#define XFS_SB_VERSION_TOOLD(v) xfs_sb_version_toold(v)
+#else
+#define XFS_SB_VERSION_TOOLD(v) \
+	(((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \
+		0 : \
+		(((v) & XFS_SB_VERSION_NLINKBIT) ? \
+			XFS_SB_VERSION_3 : \
+			(((v) & XFS_SB_VERSION_ATTRBIT) ?  \
+				XFS_SB_VERSION_2 : \
+				XFS_SB_VERSION_1)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASATTR)
+int xfs_sb_version_hasattr(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASATTR(sbp)	xfs_sb_version_hasattr(sbp)
+#else
+#define XFS_SB_VERSION_HASATTR(sbp)	\
+	(((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \
+	 ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
+	 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	  ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDATTR)
+void xfs_sb_version_addattr(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDATTR(sbp)	xfs_sb_version_addattr(sbp)
+#else
+#define XFS_SB_VERSION_ADDATTR(sbp)	\
+	((sbp)->sb_versionnum = \
+	 (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \
+		XFS_SB_VERSION_2 : \
+		((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \
+			((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \
+			(XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASNLINK)
+int xfs_sb_version_hasnlink(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASNLINK(sbp)	xfs_sb_version_hasnlink(sbp)
+#else
+#define XFS_SB_VERSION_HASNLINK(sbp)	\
+	(((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
+	 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	  ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDNLINK)
+void xfs_sb_version_addnlink(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDNLINK(sbp)	xfs_sb_version_addnlink(sbp)
+#else
+#define XFS_SB_VERSION_ADDNLINK(sbp)	\
+	((sbp)->sb_versionnum = \
+	 ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \
+		XFS_SB_VERSION_3 : \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASQUOTA)
+int xfs_sb_version_hasquota(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASQUOTA(sbp)	xfs_sb_version_hasquota(sbp)
+#else
+#define XFS_SB_VERSION_HASQUOTA(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDQUOTA)
+void xfs_sb_version_addquota(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDQUOTA(sbp)	xfs_sb_version_addquota(sbp)
+#else
+#define XFS_SB_VERSION_ADDQUOTA(sbp)	\
+	((sbp)->sb_versionnum = \
+	 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \
+		(XFS_SB_VERSION_TONEW((sbp)->sb_versionnum) | \
+		 XFS_SB_VERSION_QUOTABIT)))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASALIGN)
+int xfs_sb_version_hasalign(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASALIGN(sbp)	xfs_sb_version_hasalign(sbp)
+#else
+#define XFS_SB_VERSION_HASALIGN(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBALIGN)
+void xfs_sb_version_subalign(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_SUBALIGN(sbp)	xfs_sb_version_subalign(sbp)
+#else
+#define XFS_SB_VERSION_SUBALIGN(sbp)	\
+	((sbp)->sb_versionnum = \
+	 XFS_SB_VERSION_TOOLD((sbp)->sb_versionnum & ~XFS_SB_VERSION_ALIGNBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDALIGN)
+int xfs_sb_version_hasdalign(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASDALIGN(sbp)	xfs_sb_version_hasdalign(sbp)
+#else
+#define XFS_SB_VERSION_HASDALIGN(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDDALIGN)
+int xfs_sb_version_adddalign(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDDALIGN(sbp)	xfs_sb_version_adddalign(sbp)
+#else
+#define XFS_SB_VERSION_ADDDALIGN(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_DALIGNBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASSHARED)
+int xfs_sb_version_hasshared(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASSHARED(sbp)	xfs_sb_version_hasshared(sbp)
+#else
+#define XFS_SB_VERSION_HASSHARED(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDSHARED)
+int xfs_sb_version_addshared(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDSHARED(sbp)	xfs_sb_version_addshared(sbp)
+#else
+#define XFS_SB_VERSION_ADDSHARED(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_SHAREDBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBSHARED)
+int xfs_sb_version_subshared(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_SUBSHARED(sbp)	xfs_sb_version_subshared(sbp)
+#else
+#define XFS_SB_VERSION_SUBSHARED(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum & ~XFS_SB_VERSION_SHAREDBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASDIRV2)
+int xfs_sb_version_hasdirv2(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASDIRV2(sbp)	xfs_sb_version_hasdirv2(sbp)
+#else
+#define XFS_SB_VERSION_HASDIRV2(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASLOGV2)
+int xfs_sb_version_haslogv2(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASLOGV2(sbp)   xfs_sb_version_haslogv2(sbp)
+#else
+#define XFS_SB_VERSION_HASLOGV2(sbp)   \
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_HASEXTFLGBIT)
+int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_HASEXTFLGBIT(sbp)	xfs_sb_version_hasextflgbit(sbp)
+#else
+#define XFS_SB_VERSION_HASEXTFLGBIT(sbp)	\
+	((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+	 ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_ADDEXTFLGBIT)
+int xfs_sb_version_addextflgbit(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp)	xfs_sb_version_addextflgbit(sbp)
+#else
+#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum | XFS_SB_VERSION_EXTFLGBIT))
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_VERSION_SUBEXTFLGBIT)
+int xfs_sb_version_subextflgbit(xfs_sb_t *sbp);
+#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp)	xfs_sb_version_subextflgbit(sbp)
+#else
+#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp)	\
+	((sbp)->sb_versionnum = \
+		((sbp)->sb_versionnum & ~XFS_SB_VERSION_EXTFLGBIT))
+#endif
+
+/*
+ * end of superblock version macros
+ */
+
+#define XFS_SB_DADDR	((xfs_daddr_t)0)		/* daddr in filesystem/ag */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_SB_BLOCK)
+xfs_agblock_t xfs_sb_block(struct xfs_mount *mp);
+#define XFS_SB_BLOCK(mp)	xfs_sb_block(mp)
+#else
+#define XFS_SB_BLOCK(mp)	XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
+#endif
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_HDR_BLOCK)
+xfs_agblock_t xfs_hdr_block(struct xfs_mount *mp, xfs_daddr_t d);
+#define XFS_HDR_BLOCK(mp,d)	xfs_hdr_block(mp,d)
+#else
+#define XFS_HDR_BLOCK(mp,d)	((xfs_agblock_t)(XFS_BB_TO_FSBT(mp,d)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_DADDR_TO_FSB)
+xfs_fsblock_t xfs_daddr_to_fsb(struct xfs_mount *mp, xfs_daddr_t d);
+#define XFS_DADDR_TO_FSB(mp,d)		xfs_daddr_to_fsb(mp,d)
+#else
+#define XFS_DADDR_TO_FSB(mp,d) \
+	XFS_AGB_TO_FSB(mp, XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_FSB_TO_DADDR)
+xfs_daddr_t xfs_fsb_to_daddr(struct xfs_mount *mp, xfs_fsblock_t fsbno);
+#define XFS_FSB_TO_DADDR(mp,fsbno)	xfs_fsb_to_daddr(mp,fsbno)
+#else
+#define XFS_FSB_TO_DADDR(mp,fsbno) \
+	XFS_AGB_TO_DADDR(mp, XFS_FSB_TO_AGNO(mp,fsbno), \
+			 XFS_FSB_TO_AGBNO(mp,fsbno))
+#endif
+
+/*
+ * File system block to basic block conversions.
+ */
+#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSB(mp,bb)	\
+	(((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
+#define XFS_BB_TO_FSBT(mp,bb)	((bb) >> (mp)->m_blkbb_log)
+#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
+
+/*
+ * File system block to byte conversions.
+ */
+#define XFS_FSB_TO_B(mp,fsbno)	((xfs_fsize_t)(fsbno) << \
+				 (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSB(mp,b)	\
+	((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_TO_FSBT(mp,b)	(((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
+#define XFS_B_FSB_OFFSET(mp,b)	((b) & (mp)->m_blockmask)
+
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_SBP)
+xfs_sb_t *xfs_buf_to_sbp(struct xfs_buf *bp);
+#define XFS_BUF_TO_SBP(bp)	xfs_buf_to_sbp(bp)
+#else
+#define XFS_BUF_TO_SBP(bp)	((xfs_sb_t *)XFS_BUF_PTR(bp))
+#endif
+
+#endif	/* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
new file mode 100644
index 000000000000..292ad257bc38
--- /dev/null
+++ b/fs/xfs/xfs_trans.c
@@ -0,0 +1,1268 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+STATIC void	xfs_trans_apply_sb_deltas(xfs_trans_t *);
+STATIC uint	xfs_trans_count_vecs(xfs_trans_t *);
+STATIC void	xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *);
+STATIC void	xfs_trans_uncommit(xfs_trans_t *, uint);
+STATIC void	xfs_trans_committed(xfs_trans_t *, int);
+STATIC void	xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int);
+STATIC void	xfs_trans_free(xfs_trans_t *);
+
+kmem_zone_t		*xfs_trans_zone;
+
+
+/*
+ * Initialize the precomputed transaction reservation values
+ * in the mount structure.
+ */
+void
+xfs_trans_init(
+	xfs_mount_t	*mp)
+{
+	xfs_trans_reservations_t	*resp;
+
+	resp = &(mp->m_reservations);
+	resp->tr_write =
+		(uint)(XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_itruncate =
+		(uint)(XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_rename =
+		(uint)(XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_link = (uint)XFS_CALC_LINK_LOG_RES(mp);
+	resp->tr_remove =
+		(uint)(XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_symlink =
+		(uint)(XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_create =
+		(uint)(XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_mkdir =
+		(uint)(XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_ifree =
+		(uint)(XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_ichange =
+		(uint)(XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_growdata = (uint)XFS_CALC_GROWDATA_LOG_RES(mp);
+	resp->tr_swrite = (uint)XFS_CALC_SWRITE_LOG_RES(mp);
+	resp->tr_writeid = (uint)XFS_CALC_WRITEID_LOG_RES(mp);
+	resp->tr_addafork =
+		(uint)(XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_attrinval = (uint)XFS_CALC_ATTRINVAL_LOG_RES(mp);
+	resp->tr_attrset =
+		(uint)(XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_attrrm =
+		(uint)(XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp));
+	resp->tr_clearagi = (uint)XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp);
+	resp->tr_growrtalloc = (uint)XFS_CALC_GROWRTALLOC_LOG_RES(mp);
+	resp->tr_growrtzero = (uint)XFS_CALC_GROWRTZERO_LOG_RES(mp);
+	resp->tr_growrtfree = (uint)XFS_CALC_GROWRTFREE_LOG_RES(mp);
+}
+
+/*
+ * This routine is called to allocate a transaction structure.
+ * The type parameter indicates the type of the transaction.  These
+ * are enumerated in xfs_trans.h.
+ *
+ * Dynamically allocate the transaction structure from the transaction
+ * zone, initialize it, and return it to the caller.
+ */
+xfs_trans_t *
+xfs_trans_alloc(
+	xfs_mount_t	*mp,
+	uint		type)
+{
+	xfs_check_frozen(mp, NULL, XFS_FREEZE_TRANS);
+	return (_xfs_trans_alloc(mp, type));
+
+}
+
+xfs_trans_t *
+_xfs_trans_alloc(
+	xfs_mount_t	*mp,
+	uint		type)
+{
+	xfs_trans_t	*tp;
+	ASSERT(xfs_trans_zone != NULL);
+	tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+	tp->t_dqinfo = NULL;
+
+	/*
+	 * Initialize the transaction structure.
+	 */
+	tp->t_magic = XFS_TRANS_MAGIC;
+	tp->t_type = type;
+	tp->t_mountp = mp;
+	tp->t_items_free = XFS_LIC_NUM_SLOTS;
+	tp->t_busy_free = XFS_LBC_NUM_SLOTS;
+	XFS_LIC_INIT(&(tp->t_items));
+	XFS_LBC_INIT(&(tp->t_busy));
+
+	return (tp);
+}
+
+/*
+ * This is called to create a new transaction which will share the
+ * permanent log reservation of the given transaction.	The remaining
+ * unused block and rt extent reservations are also inherited.	This
+ * implies that the original transaction is no longer allowed to allocate
+ * blocks.  Locks and log items, however, are no inherited.  They must
+ * be added to the new transaction explicitly.
+ */
+xfs_trans_t *
+xfs_trans_dup(
+	xfs_trans_t	*tp)
+{
+	xfs_trans_t	*ntp;
+
+	ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
+
+	/*
+	 * Initialize the new transaction structure.
+	 */
+	ntp->t_magic = XFS_TRANS_MAGIC;
+	ntp->t_type = tp->t_type;
+	ntp->t_mountp = tp->t_mountp;
+	ntp->t_items_free = XFS_LIC_NUM_SLOTS;
+	ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
+	XFS_LIC_INIT(&(ntp->t_items));
+	XFS_LBC_INIT(&(ntp->t_busy));
+
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	ASSERT(!xlog_debug || tp->t_ticket != NULL);
+#else
+	ASSERT(tp->t_ticket != NULL);
+#endif
+	ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
+	ntp->t_ticket = tp->t_ticket;
+	ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
+	tp->t_blk_res = tp->t_blk_res_used;
+	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
+	tp->t_rtx_res = tp->t_rtx_res_used;
+
+	/*
+	 * dup the dquot stuff too.
+	 */
+	if (tp->t_dqinfo)
+		xfs_trans_dup_dqinfo(tp, ntp);
+
+	atomic_inc(&tp->t_mountp->m_active_trans);
+	return ntp;
+}
+
+/*
+ * This is called to reserve free disk blocks and log space for the
+ * given transaction.  This must be done before allocating any resources
+ * within the transaction.
+ *
+ * This will return ENOSPC if there are not enough blocks available.
+ * It will sleep waiting for available log space.
+ * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which
+ * is used by long running transactions.  If any one of the reservations
+ * fails then they will all be backed out.
+ *
+ * This does not do quota reservations. That typically is done by the
+ * caller afterwards.
+ */
+int
+xfs_trans_reserve(
+	xfs_trans_t	*tp,
+	uint		blocks,
+	uint		logspace,
+	uint		rtextents,
+	uint		flags,
+	uint		logcount)
+{
+	int		log_flags;
+	int		error;
+	int	rsvd;
+
+	error = 0;
+	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+	/* Mark this thread as being in a transaction */
+	current->flags |= PF_FSTRANS;
+
+	/*
+	 * Attempt to reserve the needed disk blocks by decrementing
+	 * the number needed from the number available.	 This will
+	 * fail if the count would go below zero.
+	 */
+	if (blocks > 0) {
+		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+					  -blocks, rsvd);
+		if (error != 0) {
+			current->flags &= ~PF_FSTRANS;
+			return (XFS_ERROR(ENOSPC));
+		}
+		tp->t_blk_res += blocks;
+	}
+
+	/*
+	 * Reserve the log space needed for this transaction.
+	 */
+	if (logspace > 0) {
+		ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace));
+		ASSERT((tp->t_log_count == 0) ||
+			(tp->t_log_count == logcount));
+		if (flags & XFS_TRANS_PERM_LOG_RES) {
+			log_flags = XFS_LOG_PERM_RESERV;
+			tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+		} else {
+			ASSERT(tp->t_ticket == NULL);
+			ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+			log_flags = 0;
+		}
+
+		error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
+					&tp->t_ticket,
+					XFS_TRANSACTION, log_flags);
+		if (error) {
+			goto undo_blocks;
+		}
+		tp->t_log_res = logspace;
+		tp->t_log_count = logcount;
+	}
+
+	/*
+	 * Attempt to reserve the needed realtime extents by decrementing
+	 * the number needed from the number available.	 This will
+	 * fail if the count would go below zero.
+	 */
+	if (rtextents > 0) {
+		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
+					  -rtextents, rsvd);
+		if (error) {
+			error = XFS_ERROR(ENOSPC);
+			goto undo_log;
+		}
+		tp->t_rtx_res += rtextents;
+	}
+
+	return 0;
+
+	/*
+	 * Error cases jump to one of these labels to undo any
+	 * reservations which have already been performed.
+	 */
+undo_log:
+	if (logspace > 0) {
+		if (flags & XFS_TRANS_PERM_LOG_RES) {
+			log_flags = XFS_LOG_REL_PERM_RESERV;
+		} else {
+			log_flags = 0;
+		}
+		xfs_log_done(tp->t_mountp, tp->t_ticket, log_flags);
+		tp->t_ticket = NULL;
+		tp->t_log_res = 0;
+		tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
+	}
+
+undo_blocks:
+	if (blocks > 0) {
+		(void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS,
+					 blocks, rsvd);
+		tp->t_blk_res = 0;
+	}
+
+	current->flags &= ~PF_FSTRANS;
+
+	return (error);
+}
+
+
+/*
+ * This is called to set the a callback to be called when the given
+ * transaction is committed to disk.  The transaction pointer and the
+ * argument pointer will be passed to the callback routine.
+ *
+ * Only one callback can be associated with any single transaction.
+ */
+void
+xfs_trans_callback(
+	xfs_trans_t		*tp,
+	xfs_trans_callback_t	callback,
+	void			*arg)
+{
+	ASSERT(tp->t_callback == NULL);
+	tp->t_callback = callback;
+	tp->t_callarg = arg;
+}
+
+
+/*
+ * Record the indicated change to the given field for application
+ * to the file system's superblock when the transaction commits.
+ * For now, just store the change in the transaction structure.
+ *
+ * Mark the transaction structure to indicate that the superblock
+ * needs to be updated before committing.
+ */
+void
+xfs_trans_mod_sb(
+	xfs_trans_t	*tp,
+	uint		field,
+	long		delta)
+{
+
+	switch (field) {
+	case XFS_TRANS_SB_ICOUNT:
+		ASSERT(delta > 0);
+		tp->t_icount_delta += delta;
+		break;
+	case XFS_TRANS_SB_IFREE:
+		tp->t_ifree_delta += delta;
+		break;
+	case XFS_TRANS_SB_FDBLOCKS:
+		/*
+		 * Track the number of blocks allocated in the
+		 * transaction.	 Make sure it does not exceed the
+		 * number reserved.
+		 */
+		if (delta < 0) {
+			tp->t_blk_res_used += (uint)-delta;
+			ASSERT(tp->t_blk_res_used <= tp->t_blk_res);
+		}
+		tp->t_fdblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_RES_FDBLOCKS:
+		/*
+		 * The allocation has already been applied to the
+		 * in-core superblock's counter.  This should only
+		 * be applied to the on-disk superblock.
+		 */
+		ASSERT(delta < 0);
+		tp->t_res_fdblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_FREXTENTS:
+		/*
+		 * Track the number of blocks allocated in the
+		 * transaction.	 Make sure it does not exceed the
+		 * number reserved.
+		 */
+		if (delta < 0) {
+			tp->t_rtx_res_used += (uint)-delta;
+			ASSERT(tp->t_rtx_res_used <= tp->t_rtx_res);
+		}
+		tp->t_frextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_RES_FREXTENTS:
+		/*
+		 * The allocation has already been applied to the
+		 * in-core superblocks's counter.  This should only
+		 * be applied to the on-disk superblock.
+		 */
+		ASSERT(delta < 0);
+		tp->t_res_frextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_DBLOCKS:
+		ASSERT(delta > 0);
+		tp->t_dblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_AGCOUNT:
+		ASSERT(delta > 0);
+		tp->t_agcount_delta += delta;
+		break;
+	case XFS_TRANS_SB_IMAXPCT:
+		tp->t_imaxpct_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTSIZE:
+		tp->t_rextsize_delta += delta;
+		break;
+	case XFS_TRANS_SB_RBMBLOCKS:
+		tp->t_rbmblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_RBLOCKS:
+		tp->t_rblocks_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTENTS:
+		tp->t_rextents_delta += delta;
+		break;
+	case XFS_TRANS_SB_REXTSLOG:
+		tp->t_rextslog_delta += delta;
+		break;
+	default:
+		ASSERT(0);
+		return;
+	}
+
+	tp->t_flags |= (XFS_TRANS_SB_DIRTY | XFS_TRANS_DIRTY);
+}
+
+/*
+ * xfs_trans_apply_sb_deltas() is called from the commit code
+ * to bring the superblock buffer into the current transaction
+ * and modify it as requested by earlier calls to xfs_trans_mod_sb().
+ *
+ * For now we just look at each field allowed to change and change
+ * it if necessary.
+ */
+STATIC void
+xfs_trans_apply_sb_deltas(
+	xfs_trans_t	*tp)
+{
+	xfs_sb_t	*sbp;
+	xfs_buf_t	*bp;
+	int		whole = 0;
+
+	bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
+	sbp = XFS_BUF_TO_SBP(bp);
+
+	/*
+	 * Check that superblock mods match the mods made to AGF counters.
+	 */
+	ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) ==
+	       (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta +
+		tp->t_ag_btree_delta));
+
+	if (tp->t_icount_delta != 0) {
+		INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta);
+	}
+	if (tp->t_ifree_delta != 0) {
+		INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta);
+	}
+
+	if (tp->t_fdblocks_delta != 0) {
+		INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta);
+	}
+	if (tp->t_res_fdblocks_delta != 0) {
+		INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta);
+	}
+
+	if (tp->t_frextents_delta != 0) {
+		INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_frextents_delta);
+	}
+	if (tp->t_dblocks_delta != 0) {
+		INT_MOD(sbp->sb_dblocks, ARCH_CONVERT, tp->t_dblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_agcount_delta != 0) {
+		INT_MOD(sbp->sb_agcount, ARCH_CONVERT, tp->t_agcount_delta);
+		whole = 1;
+	}
+	if (tp->t_imaxpct_delta != 0) {
+		INT_MOD(sbp->sb_imax_pct, ARCH_CONVERT, tp->t_imaxpct_delta);
+		whole = 1;
+	}
+	if (tp->t_rextsize_delta != 0) {
+		INT_MOD(sbp->sb_rextsize, ARCH_CONVERT, tp->t_rextsize_delta);
+		whole = 1;
+	}
+	if (tp->t_rbmblocks_delta != 0) {
+		INT_MOD(sbp->sb_rbmblocks, ARCH_CONVERT, tp->t_rbmblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_rblocks_delta != 0) {
+		INT_MOD(sbp->sb_rblocks, ARCH_CONVERT, tp->t_rblocks_delta);
+		whole = 1;
+	}
+	if (tp->t_rextents_delta != 0) {
+		INT_MOD(sbp->sb_rextents, ARCH_CONVERT, tp->t_rextents_delta);
+		whole = 1;
+	}
+	if (tp->t_rextslog_delta != 0) {
+		INT_MOD(sbp->sb_rextslog, ARCH_CONVERT, tp->t_rextslog_delta);
+		whole = 1;
+	}
+
+	if (whole)
+		/*
+		 * Log the whole thing, the fields are discontiguous.
+		 */
+		xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_sb_t) - 1);
+	else
+		/*
+		 * Since all the modifiable fields are contiguous, we
+		 * can get away with this.
+		 */
+		xfs_trans_log_buf(tp, bp, offsetof(xfs_sb_t, sb_icount),
+				  offsetof(xfs_sb_t, sb_frextents) +
+				  sizeof(sbp->sb_frextents) - 1);
+
+	XFS_MTOVFS(tp->t_mountp)->vfs_super->s_dirt = 1;
+}
+
+/*
+ * xfs_trans_unreserve_and_mod_sb() is called to release unused
+ * reservations and apply superblock counter changes to the in-core
+ * superblock.
+ *
+ * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
+ */
+void
+xfs_trans_unreserve_and_mod_sb(
+	xfs_trans_t	*tp)
+{
+	xfs_mod_sb_t	msb[14];	/* If you add cases, add entries */
+	xfs_mod_sb_t	*msbp;
+	/* REFERENCED */
+	int		error;
+	int		rsvd;
+
+	msbp = msb;
+	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+
+	/*
+	 * Release any reserved blocks.	 Any that were allocated
+	 * will be taken back again by fdblocks_delta below.
+	 */
+	if (tp->t_blk_res > 0) {
+		msbp->msb_field = XFS_SBS_FDBLOCKS;
+		msbp->msb_delta = tp->t_blk_res;
+		msbp++;
+	}
+
+	/*
+	 * Release any reserved real time extents .  Any that were
+	 * allocated will be taken back again by frextents_delta below.
+	 */
+	if (tp->t_rtx_res > 0) {
+		msbp->msb_field = XFS_SBS_FREXTENTS;
+		msbp->msb_delta = tp->t_rtx_res;
+		msbp++;
+	}
+
+	/*
+	 * Apply any superblock modifications to the in-core version.
+	 * The t_res_fdblocks_delta and t_res_frextents_delta fields are
+	 * explicity NOT applied to the in-core superblock.
+	 * The idea is that that has already been done.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+		if (tp->t_icount_delta != 0) {
+			msbp->msb_field = XFS_SBS_ICOUNT;
+			msbp->msb_delta = (int)tp->t_icount_delta;
+			msbp++;
+		}
+		if (tp->t_ifree_delta != 0) {
+			msbp->msb_field = XFS_SBS_IFREE;
+			msbp->msb_delta = (int)tp->t_ifree_delta;
+			msbp++;
+		}
+		if (tp->t_fdblocks_delta != 0) {
+			msbp->msb_field = XFS_SBS_FDBLOCKS;
+			msbp->msb_delta = (int)tp->t_fdblocks_delta;
+			msbp++;
+		}
+		if (tp->t_frextents_delta != 0) {
+			msbp->msb_field = XFS_SBS_FREXTENTS;
+			msbp->msb_delta = (int)tp->t_frextents_delta;
+			msbp++;
+		}
+		if (tp->t_dblocks_delta != 0) {
+			msbp->msb_field = XFS_SBS_DBLOCKS;
+			msbp->msb_delta = (int)tp->t_dblocks_delta;
+			msbp++;
+		}
+		if (tp->t_agcount_delta != 0) {
+			msbp->msb_field = XFS_SBS_AGCOUNT;
+			msbp->msb_delta = (int)tp->t_agcount_delta;
+			msbp++;
+		}
+		if (tp->t_imaxpct_delta != 0) {
+			msbp->msb_field = XFS_SBS_IMAX_PCT;
+			msbp->msb_delta = (int)tp->t_imaxpct_delta;
+			msbp++;
+		}
+		if (tp->t_rextsize_delta != 0) {
+			msbp->msb_field = XFS_SBS_REXTSIZE;
+			msbp->msb_delta = (int)tp->t_rextsize_delta;
+			msbp++;
+		}
+		if (tp->t_rbmblocks_delta != 0) {
+			msbp->msb_field = XFS_SBS_RBMBLOCKS;
+			msbp->msb_delta = (int)tp->t_rbmblocks_delta;
+			msbp++;
+		}
+		if (tp->t_rblocks_delta != 0) {
+			msbp->msb_field = XFS_SBS_RBLOCKS;
+			msbp->msb_delta = (int)tp->t_rblocks_delta;
+			msbp++;
+		}
+		if (tp->t_rextents_delta != 0) {
+			msbp->msb_field = XFS_SBS_REXTENTS;
+			msbp->msb_delta = (int)tp->t_rextents_delta;
+			msbp++;
+		}
+		if (tp->t_rextslog_delta != 0) {
+			msbp->msb_field = XFS_SBS_REXTSLOG;
+			msbp->msb_delta = (int)tp->t_rextslog_delta;
+			msbp++;
+		}
+	}
+
+	/*
+	 * If we need to change anything, do it.
+	 */
+	if (msbp > msb) {
+		error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
+			(uint)(msbp - msb), rsvd);
+		ASSERT(error == 0);
+	}
+}
+
+
+/*
+ * xfs_trans_commit
+ *
+ * Commit the given transaction to the log a/synchronously.
+ *
+ * XFS disk error handling mechanism is not based on a typical
+ * transaction abort mechanism. Logically after the filesystem
+ * gets marked 'SHUTDOWN', we can't let any new transactions
+ * be durable - ie. committed to disk - because some metadata might
+ * be inconsistent. In such cases, this returns an error, and the
+ * caller may assume that all locked objects joined to the transaction
+ * have already been unlocked as if the commit had succeeded.
+ * It's illegal to reference the transaction structure after this call.
+ */
+ /*ARGSUSED*/
+int
+xfs_trans_commit(
+	xfs_trans_t	*tp,
+	uint		flags,
+	xfs_lsn_t	*commit_lsn_p)
+{
+	xfs_log_iovec_t		*log_vector;
+	int			nvec;
+	xfs_mount_t		*mp;
+	xfs_lsn_t		commit_lsn;
+	/* REFERENCED */
+	int			error;
+	int			log_flags;
+	int			sync;
+#define XFS_TRANS_LOGVEC_COUNT	16
+	xfs_log_iovec_t		log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	static xfs_lsn_t	trans_lsn = 1;
+#endif
+	int			shutdown;
+
+	commit_lsn = -1;
+
+	/*
+	 * Determine whether this commit is releasing a permanent
+	 * log reservation or not.
+	 */
+	if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+		ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+	} else {
+		log_flags = 0;
+	}
+	mp = tp->t_mountp;
+
+	/*
+	 * If there is nothing to be logged by the transaction,
+	 * then unlock all of the items associated with the
+	 * transaction and free the transaction structure.
+	 * Also make sure to return any reserved blocks to
+	 * the free pool.
+	 */
+shut_us_down:
+	shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0;
+	if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) {
+		xfs_trans_unreserve_and_mod_sb(tp);
+		/*
+		 * It is indeed possible for the transaction to be
+		 * not dirty but the dqinfo portion to be. All that
+		 * means is that we have some (non-persistent) quota
+		 * reservations that need to be unreserved.
+		 */
+		if (tp->t_dqinfo && (tp->t_flags & XFS_TRANS_DQ_DIRTY)) {
+			xfs_trans_unreserve_and_mod_dquots(tp);
+		}
+		if (tp->t_ticket) {
+			commit_lsn = xfs_log_done(mp, tp->t_ticket, log_flags);
+			if (commit_lsn == -1 && !shutdown)
+				shutdown = XFS_ERROR(EIO);
+		}
+		xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0);
+		xfs_trans_free_busy(tp);
+		xfs_trans_free(tp);
+		XFS_STATS_INC(xfsstats.xs_trans_empty);
+		if (commit_lsn_p)
+			*commit_lsn_p = commit_lsn;
+		current->flags &= ~PF_FSTRANS;
+		return (shutdown);
+	}
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	ASSERT(!xlog_debug || tp->t_ticket != NULL);
+#else
+	ASSERT(tp->t_ticket != NULL);
+#endif
+
+	/*
+	 * If we need to update the superblock, then do it now.
+	 */
+	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+		xfs_trans_apply_sb_deltas(tp);
+	}
+	if (tp->t_flags & XFS_TRANS_DQ_DIRTY) {
+		xfs_trans_apply_dquot_deltas(tp);
+	}
+
+	/*
+	 * Ask each log item how many log_vector entries it will
+	 * need so we can figure out how many to allocate.
+	 * Try to avoid the kmem_alloc() call in the common case
+	 * by using a vector from the stack when it fits.
+	 */
+	nvec = xfs_trans_count_vecs(tp);
+
+	if (nvec == 0) {
+		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		goto shut_us_down;
+	}
+
+
+	if (nvec <= XFS_TRANS_LOGVEC_COUNT) {
+		log_vector = log_vector_fast;
+	} else {
+		log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec *
+						   sizeof(xfs_log_iovec_t),
+						   KM_SLEEP);
+	}
+
+	/*
+	 * Fill in the log_vector and pin the logged items, and
+	 * then write the transaction to the log.
+	 */
+	xfs_trans_fill_vecs(tp, log_vector);
+
+	/*
+	 * Ignore errors here. xfs_log_done would do the right thing.
+	 * We need to put the ticket, etc. away.
+	 */
+	error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket,
+			     &(tp->t_lsn));
+
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	if (xlog_debug) {
+		commit_lsn = xfs_log_done(mp, tp->t_ticket,
+					  log_flags);
+	} else {
+		commit_lsn = 0;
+		tp->t_lsn = trans_lsn++;
+	}
+#else
+	/*
+	 * This is the regular case.  At this point (after the call finishes),
+	 * the transaction is committed incore and could go out to disk at
+	 * any time.  However, all the items associated with the transaction
+	 * are still locked and pinned in memory.
+	 */
+	commit_lsn = xfs_log_done(mp, tp->t_ticket, log_flags);
+#endif
+
+	if (nvec > XFS_TRANS_LOGVEC_COUNT) {
+		kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t));
+	}
+
+	if (commit_lsn_p)
+		*commit_lsn_p = commit_lsn;
+
+	/*
+	 * If we got a log write error. Unpin the logitems that we
+	 * had pinned, clean up, free trans structure, and return error.
+	 */
+	if (error || commit_lsn == -1) {
+		xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT);
+		current->flags &= ~PF_FSTRANS;
+		return XFS_ERROR(EIO);
+	}
+
+	/*
+	 * Once all the items of the transaction have been copied
+	 * to the in core log we can release them.  Do that here.
+	 * This will free descriptors pointing to items which were
+	 * not logged since there is nothing more to do with them.
+	 * For items which were logged, we will keep pointers to them
+	 * so they can be unpinned after the transaction commits to disk.
+	 * This will also stamp each modified meta-data item with
+	 * the commit lsn of this transaction for dependency tracking
+	 * purposes.
+	 */
+	xfs_trans_unlock_items(tp, commit_lsn);
+
+	/*
+	 * Once the transaction has committed, unused
+	 * reservations need to be released and changes to
+	 * the superblock need to be reflected in the in-core
+	 * version.  Do that now.
+	 */
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	sync = tp->t_flags & XFS_TRANS_SYNC;
+
+	/*
+	 * Tell the LM to call the transaction completion routine
+	 * when the log write with LSN commit_lsn completes (e.g.
+	 * when the transaction commit really hits the on-disk log).
+	 * After this call we cannot reference tp, because the call
+	 * can happen at any time and the call will free the transaction
+	 * structure pointed to by tp.	The only case where we call
+	 * the completion routine (xfs_trans_committed) directly is
+	 * if the log is turned off on a debug kernel or we're
+	 * running in simulation mode (the log is explicitly turned
+	 * off).
+	 */
+#if defined(XLOG_NOLOG) || defined(DEBUG)
+	if (xlog_debug) {
+		tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
+		tp->t_logcb.cb_arg = tp;
+		xfs_log_notify(mp, commit_lsn, &(tp->t_logcb));
+	} else {
+		xfs_trans_committed(tp, 0);
+	}
+#else
+	tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed;
+	tp->t_logcb.cb_arg = tp;
+	xfs_log_notify(mp, commit_lsn, &(tp->t_logcb));
+#endif
+	/*
+	 * If the transaction needs to be synchronous, then force the
+	 * log out now and wait for it.
+	 */
+	if (sync) {
+		error = xfs_log_force(mp, commit_lsn,
+				      XFS_LOG_FORCE | XFS_LOG_SYNC);
+		XFS_STATS_INC(xfsstats.xs_trans_sync);
+	} else {
+		XFS_STATS_INC(xfsstats.xs_trans_async);
+	}
+
+	/* mark this thread as no longer being in a transaction */
+	current->flags &= ~PF_FSTRANS;
+
+	return (error);
+}
+
+
+/*
+ * Total up the number of log iovecs needed to commit this
+ * transaction.	 The transaction itself needs one for the
+ * transaction header.	Ask each dirty item in turn how many
+ * it needs to get the total.
+ */
+STATIC uint
+xfs_trans_count_vecs(
+	xfs_trans_t	*tp)
+{
+	int			nvecs;
+	xfs_log_item_desc_t	*lidp;
+
+	nvecs = 1;
+	lidp = xfs_trans_first_item(tp);
+	ASSERT(lidp != NULL);
+
+	/* In the non-debug case we need to start bailing out if we
+	 * didn't find a log_item here, return zero and let trans_commit
+	 * deal with it.
+	 */
+	if (lidp == NULL)
+		return 0;
+
+	while (lidp != NULL) {
+		/*
+		 * Skip items which aren't dirty in this transaction.
+		 */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+		lidp->lid_size = IOP_SIZE(lidp->lid_item);
+		nvecs += lidp->lid_size;
+		lidp = xfs_trans_next_item(tp, lidp);
+	}
+
+	return nvecs;
+}
+
+/*
+ * Called from the trans_commit code when we notice that
+ * the filesystem is in the middle of a forced shutdown.
+ */
+STATIC void
+xfs_trans_uncommit(
+	xfs_trans_t	*tp,
+	uint		flags)
+{
+	xfs_log_item_desc_t	*lidp;
+
+	for (lidp = xfs_trans_first_item(tp);
+	     lidp != NULL;
+	     lidp = xfs_trans_next_item(tp, lidp)) {
+		/*
+		 * Unpin all but those that aren't dirty.
+		 */
+		if (lidp->lid_flags & XFS_LID_DIRTY)
+			IOP_UNPIN_REMOVE(lidp->lid_item, tp);
+	}
+
+	xfs_trans_unreserve_and_mod_sb(tp);
+	if (tp->t_dqinfo && (tp->t_flags & XFS_TRANS_DQ_DIRTY)) {
+		xfs_trans_unreserve_and_mod_dquots(tp);
+	}
+
+	xfs_trans_free_items(tp, flags);
+	xfs_trans_free_busy(tp);
+	xfs_trans_free(tp);
+}
+
+/*
+ * Fill in the vector with pointers to data to be logged
+ * by this transaction.	 The transaction header takes
+ * the first vector, and then each dirty item takes the
+ * number of vectors it indicated it needed in xfs_trans_count_vecs().
+ *
+ * As each item fills in the entries it needs, also pin the item
+ * so that it cannot be flushed out until the log write completes.
+ */
+STATIC void
+xfs_trans_fill_vecs(
+	xfs_trans_t		*tp,
+	xfs_log_iovec_t		*log_vector)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_iovec_t		*vecp;
+	uint			nitems;
+
+	/*
+	 * Skip over the entry for the transaction header, we'll
+	 * fill that in at the end.
+	 */
+	vecp = log_vector + 1;		/* pointer arithmetic */
+
+	nitems = 0;
+	lidp = xfs_trans_first_item(tp);
+	ASSERT(lidp != NULL);
+	while (lidp != NULL) {
+		/*
+		 * Skip items which aren't dirty in this transaction.
+		 */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+			lidp = xfs_trans_next_item(tp, lidp);
+			continue;
+		}
+		/*
+		 * The item may be marked dirty but not log anything.
+		 * This can be used to get called when a transaction
+		 * is committed.
+		 */
+		if (lidp->lid_size) {
+			nitems++;
+		}
+		IOP_FORMAT(lidp->lid_item, vecp);
+		vecp += lidp->lid_size;		/* pointer arithmetic */
+		IOP_PIN(lidp->lid_item);
+		lidp = xfs_trans_next_item(tp, lidp);
+	}
+
+	/*
+	 * Now that we've counted the number of items in this
+	 * transaction, fill in the transaction header.
+	 */
+	tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC;
+	tp->t_header.th_type = tp->t_type;
+	tp->t_header.th_num_items = nitems;
+	log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
+	log_vector->i_len = sizeof(xfs_trans_header_t);
+}
+
+
+/*
+ * Unlock all of the transaction's items and free the transaction.
+ * The transaction must not have modified any of its items, because
+ * there is no way to restore them to their previous state.
+ *
+ * If the transaction has made a log reservation, make sure to release
+ * it as well.
+ */
+void
+xfs_trans_cancel(
+	xfs_trans_t		*tp,
+	int			flags)
+{
+	int			log_flags;
+#ifdef DEBUG
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_item_t		*lip;
+	int			i;
+#endif
+
+	/*
+	 * See if the caller is being too lazy to figure out if
+	 * the transaction really needs an abort.
+	 */
+	if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY))
+		flags &= ~XFS_TRANS_ABORT;
+	/*
+	 * See if the caller is relying on us to shut down the
+	 * filesystem.	This happens in paths where we detect
+	 * corruption and decide to give up.
+	 */
+	if ((tp->t_flags & XFS_TRANS_DIRTY) &&
+	    !XFS_FORCED_SHUTDOWN(tp->t_mountp))
+		xfs_force_shutdown(tp->t_mountp, XFS_CORRUPT_INCORE);
+#ifdef DEBUG
+	if (!(flags & XFS_TRANS_ABORT)) {
+		licp = &(tp->t_items);
+		while (licp != NULL) {
+			lidp = licp->lic_descs;
+			for (i = 0; i < licp->lic_unused; i++, lidp++) {
+				if (XFS_LIC_ISFREE(licp, i)) {
+					continue;
+				}
+
+				lip = lidp->lid_item;
+				if (!XFS_FORCED_SHUTDOWN(tp->t_mountp))
+					ASSERT(!(lip->li_type == XFS_LI_EFD));
+			}
+			licp = licp->lic_next;
+		}
+	}
+#endif
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	if (tp->t_dqinfo && (tp->t_flags & XFS_TRANS_DQ_DIRTY))
+		xfs_trans_unreserve_and_mod_dquots(tp);
+
+	if (tp->t_ticket) {
+		if (flags & XFS_TRANS_RELEASE_LOG_RES) {
+			ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+			log_flags = XFS_LOG_REL_PERM_RESERV;
+		} else {
+			log_flags = 0;
+		}
+		xfs_log_done(tp->t_mountp, tp->t_ticket, log_flags);
+	}
+	xfs_trans_free_items(tp, flags);
+	xfs_trans_free_busy(tp);
+	xfs_trans_free(tp);
+
+	/* mark this thread as no longer being in a transaction */
+	current->flags &= ~PF_FSTRANS;
+}
+
+
+/*
+ * Free the transaction structure.  If there is more clean up
+ * to do when the structure is freed, add it here.
+ */
+STATIC void
+xfs_trans_free(
+	xfs_trans_t	*tp)
+{
+	atomic_dec(&tp->t_mountp->m_active_trans);
+	if (tp->t_dqinfo)
+		xfs_trans_free_dqinfo(tp);
+	kmem_zone_free(xfs_trans_zone, tp);
+}
+
+
+/*
+ * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
+ *
+ * This is typically called by the LM when a transaction has been fully
+ * committed to disk.  It needs to unpin the items which have
+ * been logged by the transaction and update their positions
+ * in the AIL if necessary.
+ * This also gets called when the transactions didn't get written out
+ * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then.
+ *
+ * Call xfs_trans_chunk_committed() to process the items in
+ * each chunk.
+ */
+STATIC void
+xfs_trans_committed(
+	xfs_trans_t	*tp,
+	int		abortflag)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_chunk_t	*next_licp;
+	xfs_log_busy_chunk_t	*lbcp;
+	xfs_log_busy_slot_t	*lbsp;
+	int			i;
+
+	/*
+	 * Call the transaction's completion callback if there
+	 * is one.
+	 */
+	if (tp->t_callback != NULL) {
+		tp->t_callback(tp, tp->t_callarg);
+	}
+
+	/*
+	 * Special case the chunk embedded in the transaction.
+	 */
+	licp = &(tp->t_items);
+	if (!(XFS_LIC_ARE_ALL_FREE(licp))) {
+		xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
+	}
+
+	/*
+	 * Process the items in each chunk in turn.
+	 */
+	licp = licp->lic_next;
+	while (licp != NULL) {
+		ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+		xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
+		next_licp = licp->lic_next;
+		kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+		licp = next_licp;
+	}
+
+	/*
+	 * Clear all the per-AG busy list items listed in this transaction
+	 */
+	lbcp = &tp->t_busy;
+	while (lbcp != NULL) {
+		for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
+			if (!XFS_LBC_ISFREE(lbcp, i)) {
+				xfs_alloc_clear_busy(tp, lbsp->lbc_ag,
+						     lbsp->lbc_idx);
+			}
+		}
+		lbcp = lbcp->lbc_next;
+	}
+	xfs_trans_free_busy(tp);
+
+	/*
+	 * That's it for the transaction structure.  Free it.
+	 */
+	xfs_trans_free(tp);
+}
+
+/*
+ * This is called to perform the commit processing for each
+ * item described by the given chunk.
+ *
+ * The commit processing consists of unlocking items which were
+ * held locked with the SYNC_UNLOCK attribute, calling the committed
+ * routine of each logged item, updating the item's position in the AIL
+ * if necessary, and unpinning each item.  If the committed routine
+ * returns -1, then do nothing further with the item because it
+ * may have been freed.
+ *
+ * Since items are unlocked when they are copied to the incore
+ * log, it is possible for two transactions to be completing
+ * and manipulating the same item simultaneously.  The AIL lock
+ * will protect the lsn field of each item.  The value of this
+ * field can never go backwards.
+ *
+ * We unpin the items after repositioning them in the AIL, because
+ * otherwise they could be immediately flushed and we'd have to race
+ * with the flusher trying to pull the item from the AIL as we add it.
+ */
+STATIC void
+xfs_trans_chunk_committed(
+	xfs_log_item_chunk_t	*licp,
+	xfs_lsn_t		lsn,
+	int			aborted)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_item_t		*lip;
+	xfs_lsn_t		item_lsn;
+	struct xfs_mount	*mp;
+	int			i;
+	SPLDECL(s);
+
+	lidp = licp->lic_descs;
+	for (i = 0; i < licp->lic_unused; i++, lidp++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+
+		lip = lidp->lid_item;
+		if (aborted)
+			lip->li_flags |= XFS_LI_ABORTED;
+
+		if (lidp->lid_flags & XFS_LID_SYNC_UNLOCK) {
+			IOP_UNLOCK(lip);
+		}
+
+		/*
+		 * Send in the ABORTED flag to the COMMITTED routine
+		 * so that it knows whether the transaction was aborted
+		 * or not.
+		 */
+		item_lsn = IOP_COMMITTED(lip, lsn);
+
+		/*
+		 * If the committed routine returns -1, make
+		 * no more references to the item.
+		 */
+		if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) {
+			continue;
+		}
+
+		/*
+		 * If the returned lsn is greater than what it
+		 * contained before, update the location of the
+		 * item in the AIL.  If it is not, then do nothing.
+		 * Items can never move backwards in the AIL.
+		 *
+		 * While the new lsn should usually be greater, it
+		 * is possible that a later transaction completing
+		 * simultaneously with an earlier one using the
+		 * same item could complete first with a higher lsn.
+		 * This would cause the earlier transaction to fail
+		 * the test below.
+		 */
+		mp = lip->li_mountp;
+		AIL_LOCK(mp,s);
+		if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
+			/*
+			 * This will set the item's lsn to item_lsn
+			 * and update the position of the item in
+			 * the AIL.
+			 *
+			 * xfs_trans_update_ail() drops the AIL lock.
+			 */
+			xfs_trans_update_ail(mp, lip, item_lsn, s);
+		} else {
+			AIL_UNLOCK(mp, s);
+		}
+
+		/*
+		 * Now that we've repositioned the item in the AIL,
+		 * unpin it so it can be flushed.
+		 */
+		IOP_UNPIN(lip);
+	}
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
new file mode 100644
index 000000000000..1845dd874a4b
--- /dev/null
+++ b/fs/xfs/xfs_trans.h
@@ -0,0 +1,1034 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_TRANS_H__
+#define __XFS_TRANS_H__
+
+/*
+ * This is the structure written in the log at the head of
+ * every transaction. It identifies the type and id of the
+ * transaction, and contains the number of items logged by
+ * the transaction so we know how many to expect during recovery.
+ *
+ * Do not change the below structure without redoing the code in
+ * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
+ */
+typedef struct xfs_trans_header {
+	uint		th_magic;		/* magic number */
+	uint		th_type;		/* transaction type */
+	__int32_t	th_tid;			/* transaction id (unused) */
+	uint		th_num_items;		/* num items logged by trans */
+} xfs_trans_header_t;
+
+#define XFS_TRANS_HEADER_MAGIC	0x5452414e	/* TRAN */
+
+/*
+ * Log item types.
+ */
+#define XFS_LI_5_3_BUF		0x1234	/* v1 bufs, 1-block inode buffers */
+#define XFS_LI_5_3_INODE	0x1235	/* 1-block inode buffers */
+#define XFS_LI_EFI		0x1236
+#define XFS_LI_EFD		0x1237
+#define XFS_LI_IUNLINK		0x1238
+#define XFS_LI_6_1_INODE	0x1239	/* 4K non-aligned inode bufs */
+#define XFS_LI_6_1_BUF		0x123a	/* v1, 4K inode buffers */
+#define XFS_LI_INODE		0x123b	/* aligned ino chunks, var-size ibufs */
+#define XFS_LI_BUF		0x123c	/* v2 bufs, variable sized inode bufs */
+#define XFS_LI_DQUOT		0x123d
+#define XFS_LI_QUOTAOFF		0x123e
+#define XFS_LI_RPC		0x123f	/* CXFS RPC return info */
+
+/*
+ * Transaction types.  Used to distinguish types of buffers.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE	1
+#define XFS_TRANS_SETATTR_SIZE		2
+#define XFS_TRANS_INACTIVE		3
+#define XFS_TRANS_CREATE		4
+#define XFS_TRANS_CREATE_TRUNC		5
+#define XFS_TRANS_TRUNCATE_FILE		6
+#define XFS_TRANS_REMOVE		7
+#define XFS_TRANS_LINK			8
+#define XFS_TRANS_RENAME		9
+#define XFS_TRANS_MKDIR			10
+#define XFS_TRANS_RMDIR			11
+#define XFS_TRANS_SYMLINK		12
+#define XFS_TRANS_SET_DMATTRS		13
+#define XFS_TRANS_GROWFS		14
+#define XFS_TRANS_STRAT_WRITE		15
+#define XFS_TRANS_DIOSTRAT		16
+#define XFS_TRANS_WRITE_SYNC		17
+#define XFS_TRANS_WRITEID		18
+#define XFS_TRANS_ADDAFORK		19
+#define XFS_TRANS_ATTRINVAL		20
+#define XFS_TRANS_ATRUNCATE		21
+#define XFS_TRANS_ATTR_SET		22
+#define XFS_TRANS_ATTR_RM		23
+#define XFS_TRANS_ATTR_FLAG		24
+#define XFS_TRANS_CLEAR_AGI_BUCKET	25
+#define XFS_TRANS_QM_SBCHANGE		26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1		27
+#define XFS_TRANS_DUMMY2		28
+#define XFS_TRANS_QM_QUOTAOFF		29
+#define XFS_TRANS_QM_DQALLOC		30
+#define XFS_TRANS_QM_SETQLIM		31
+#define XFS_TRANS_QM_DQCLUSTER		32
+#define XFS_TRANS_QM_QINOCREATE		33
+#define XFS_TRANS_QM_QUOTAOFF_END	34
+#define XFS_TRANS_SB_UNIT		35
+#define XFS_TRANS_FSYNC_TS		36
+#define XFS_TRANS_GROWFSRT_ALLOC	37
+#define XFS_TRANS_GROWFSRT_ZERO		38
+#define XFS_TRANS_GROWFSRT_FREE		39
+#define XFS_TRANS_SWAPEXT		40
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+
+#ifdef __KERNEL__
+struct xfs_buf;
+struct xfs_buftarg;
+struct xfs_efd_log_item;
+struct xfs_efi_log_item;
+struct xfs_inode;
+struct xfs_item_ops;
+struct xfs_log_iovec;
+struct xfs_log_item;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot_acct;
+
+typedef struct xfs_ail_entry {
+	struct xfs_log_item	*ail_forw;	/* AIL forw pointer */
+	struct xfs_log_item	*ail_back;	/* AIL back pointer */
+} xfs_ail_entry_t;
+
+/*
+ * This structure is passed as a parameter to xfs_trans_push_ail()
+ * and is used to track the what LSN the waiting processes are
+ * waiting to become unused.
+ */
+typedef struct xfs_ail_ticket {
+	xfs_lsn_t		at_lsn;		/* lsn waitin for */
+	struct xfs_ail_ticket	*at_forw;	/* wait list ptr */
+	struct xfs_ail_ticket	*at_back;	/* wait list ptr */
+	sv_t			at_sema;	/* wait sema */
+} xfs_ail_ticket_t;
+
+
+typedef struct xfs_log_item {
+	xfs_ail_entry_t			li_ail;		/* AIL pointers */
+	xfs_lsn_t			li_lsn;		/* last on-disk lsn */
+	struct xfs_log_item_desc	*li_desc;	/* ptr to current desc*/
+	struct xfs_mount		*li_mountp;	/* ptr to fs mount */
+	uint				li_type;	/* item type */
+	uint				li_flags;	/* misc flags */
+	struct xfs_log_item		*li_bio_list;	/* buffer item list */
+	void				(*li_cb)(struct xfs_buf *,
+						 struct xfs_log_item *);
+							/* buffer item iodone */
+							/* callback func */
+	struct xfs_item_ops		*li_ops;	/* function list */
+} xfs_log_item_t;
+
+#define XFS_LI_IN_AIL	0x1
+#define XFS_LI_ABORTED	0x2
+
+typedef struct xfs_item_ops {
+	uint (*iop_size)(xfs_log_item_t *);
+	void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+	void (*iop_pin)(xfs_log_item_t *);
+	void (*iop_unpin)(xfs_log_item_t *);
+	void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
+	uint (*iop_trylock)(xfs_log_item_t *);
+	void (*iop_unlock)(xfs_log_item_t *);
+	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+	void (*iop_push)(xfs_log_item_t *);
+	void (*iop_abort)(xfs_log_item_t *);
+	void (*iop_pushbuf)(xfs_log_item_t *);
+	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+} xfs_item_ops_t;
+
+#define IOP_SIZE(ip)		(*(ip)->li_ops->iop_size)(ip)
+#define IOP_FORMAT(ip,vp)	(*(ip)->li_ops->iop_format)(ip, vp)
+#define IOP_PIN(ip)		(*(ip)->li_ops->iop_pin)(ip)
+#define IOP_UNPIN(ip)		(*(ip)->li_ops->iop_unpin)(ip)
+#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
+#define IOP_TRYLOCK(ip)		(*(ip)->li_ops->iop_trylock)(ip)
+#define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
+#define IOP_COMMITTED(ip, lsn)	(*(ip)->li_ops->iop_committed)(ip, lsn)
+#define IOP_PUSH(ip)		(*(ip)->li_ops->iop_push)(ip)
+#define IOP_ABORT(ip)		(*(ip)->li_ops->iop_abort)(ip)
+#define IOP_PUSHBUF(ip)		(*(ip)->li_ops->iop_pushbuf)(ip)
+#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
+
+/*
+ * Return values for the IOP_TRYLOCK() routines.
+ */
+#define XFS_ITEM_SUCCESS	0
+#define XFS_ITEM_PINNED		1
+#define XFS_ITEM_LOCKED		2
+#define XFS_ITEM_FLUSHING	3
+#define XFS_ITEM_PUSHBUF	4
+
+#endif	/* __KERNEL__ */
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction.  It points to the log item and keeps some
+ * flags to track the state of the log item.  It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+typedef struct xfs_log_item_desc {
+	xfs_log_item_t	*lid_item;
+	ushort		lid_size;
+	unsigned char	lid_flags;
+	unsigned char	lid_index;
+} xfs_log_item_desc_t;
+
+#define XFS_LID_DIRTY		0x1
+#define XFS_LID_PINNED		0x2
+#define XFS_LID_SYNC_UNLOCK	0x4
+
+/*
+ * This structure is used to maintain a chunk list of log_item_desc
+ * structures. The free field is a bitmask indicating which descriptors
+ * in this chunk's array are free.  The unused field is the first value
+ * not used since this chunk was allocated.
+ */
+#define XFS_LIC_NUM_SLOTS	15
+typedef struct xfs_log_item_chunk {
+	struct xfs_log_item_chunk	*lic_next;
+	ushort				lic_free;
+	ushort				lic_unused;
+	xfs_log_item_desc_t		lic_descs[XFS_LIC_NUM_SLOTS];
+} xfs_log_item_chunk_t;
+
+#define XFS_LIC_MAX_SLOT	(XFS_LIC_NUM_SLOTS - 1)
+#define XFS_LIC_FREEMASK	((1 << XFS_LIC_NUM_SLOTS) - 1)
+
+
+/*
+ * Initialize the given chunk.	Set the chunk's free descriptor mask
+ * to indicate that all descriptors are free.  The caller gets to set
+ * lic_unused to the right value (0 matches all free).	The
+ * lic_descs.lid_index values are set up as each desc is allocated.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT)
+void xfs_lic_init(xfs_log_item_chunk_t *cp);
+#define XFS_LIC_INIT(cp)	xfs_lic_init(cp)
+#else
+#define XFS_LIC_INIT(cp)	((cp)->lic_free = XFS_LIC_FREEMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_INIT_SLOT)
+void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_INIT_SLOT(cp,slot)	xfs_lic_init_slot(cp, slot)
+#else
+#define XFS_LIC_INIT_SLOT(cp,slot)	\
+	((cp)->lic_descs[slot].lid_index = (unsigned char)(slot))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_VACANCY)
+int xfs_lic_vacancy(xfs_log_item_chunk_t *cp);
+#define XFS_LIC_VACANCY(cp)		xfs_lic_vacancy(cp)
+#else
+#define XFS_LIC_VACANCY(cp)		(((cp)->lic_free) & XFS_LIC_FREEMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ALL_FREE)
+void xfs_lic_all_free(xfs_log_item_chunk_t *cp);
+#define XFS_LIC_ALL_FREE(cp)		xfs_lic_all_free(cp)
+#else
+#define XFS_LIC_ALL_FREE(cp)		((cp)->lic_free = XFS_LIC_FREEMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ARE_ALL_FREE)
+int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp);
+#define XFS_LIC_ARE_ALL_FREE(cp)	xfs_lic_are_all_free(cp)
+#else
+#define XFS_LIC_ARE_ALL_FREE(cp)	(((cp)->lic_free & XFS_LIC_FREEMASK) ==\
+					XFS_LIC_FREEMASK)
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_ISFREE)
+int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot)
+#else
+#define XFS_LIC_ISFREE(cp,slot) ((cp)->lic_free & (1 << (slot)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_CLAIM)
+void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_CLAIM(cp,slot)		xfs_lic_claim(cp,slot)
+#else
+#define XFS_LIC_CLAIM(cp,slot)		((cp)->lic_free &= ~(1 << (slot)))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_RELSE)
+void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_RELSE(cp,slot)		xfs_lic_relse(cp,slot)
+#else
+#define XFS_LIC_RELSE(cp,slot)		((cp)->lic_free |= 1 << (slot))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_SLOT)
+xfs_log_item_desc_t *xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot);
+#define XFS_LIC_SLOT(cp,slot)		xfs_lic_slot(cp,slot)
+#else
+#define XFS_LIC_SLOT(cp,slot)		(&((cp)->lic_descs[slot]))
+#endif
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_SLOT)
+int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp);
+#define XFS_LIC_DESC_TO_SLOT(dp)	xfs_lic_desc_to_slot(dp)
+#else
+#define XFS_LIC_DESC_TO_SLOT(dp)	((uint)((dp)->lid_index))
+#endif
+/*
+ * Calculate the address of a chunk given a descriptor pointer:
+ * dp - dp->lid_index give the address of the start of the lic_descs array.
+ * From this we subtract the offset of the lic_descs field in a chunk.
+ * All of this yields the address of the chunk, which is
+ * cast to a chunk pointer.
+ */
+#if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_LIC_DESC_TO_CHUNK)
+xfs_log_item_chunk_t *xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp);
+#define XFS_LIC_DESC_TO_CHUNK(dp)	xfs_lic_desc_to_chunk(dp)
+#else
+#define XFS_LIC_DESC_TO_CHUNK(dp)	((xfs_log_item_chunk_t*) \
+					(((xfs_caddr_t)((dp) - (dp)->lid_index)) -\
+					(xfs_caddr_t)(((xfs_log_item_chunk_t*) \
+					0)->lic_descs)))
+#endif
+
+#ifdef __KERNEL__
+/*
+ * This structure is used to maintain a list of block ranges that have been
+ * freed in the transaction.  The ranges are listed in the perag[] busy list
+ * between when they're freed and the transaction is committed to disk.
+ */
+
+typedef struct xfs_log_busy_slot {
+	xfs_agnumber_t		lbc_ag;
+	ushort			lbc_idx;	/* index in perag.busy[] */
+} xfs_log_busy_slot_t;
+
+#define XFS_LBC_NUM_SLOTS	31
+typedef struct xfs_log_busy_chunk {
+	struct xfs_log_busy_chunk	*lbc_next;
+	uint				lbc_free;	/* bitmask of free slots */
+	ushort				lbc_unused;	/* first unused */
+	xfs_log_busy_slot_t		lbc_busy[XFS_LBC_NUM_SLOTS];
+} xfs_log_busy_chunk_t;
+
+#define XFS_LBC_MAX_SLOT	(XFS_LBC_NUM_SLOTS - 1)
+#define XFS_LBC_FREEMASK	((1U << XFS_LBC_NUM_SLOTS) - 1)
+
+#define XFS_LBC_INIT(cp)	((cp)->lbc_free = XFS_LBC_FREEMASK)
+#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
+#define XFS_LBC_SLOT(cp, slot)	(&((cp)->lbc_busy[(slot)]))
+#define XFS_LBC_VACANCY(cp)	(((cp)->lbc_free) & XFS_LBC_FREEMASK)
+#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
+
+/*
+ * This is the type of function which can be given to xfs_trans_callback()
+ * to be called upon the transaction's commit to disk.
+ */
+typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
+
+/*
+ * This is the structure maintained for every active transaction.
+ */
+typedef struct xfs_trans {
+	unsigned int		t_magic;	/* magic number */
+	xfs_log_callback_t	t_logcb;	/* log callback struct */
+	struct xfs_trans	*t_forw;	/* async list pointers */
+	struct xfs_trans	*t_back;	/* async list pointers */
+	unsigned int		t_type;		/* transaction type */
+	unsigned int		t_log_res;	/* amt of log space resvd */
+	unsigned int		t_log_count;	/* count for perm log res */
+	unsigned int		t_blk_res;	/* # of blocks resvd */
+	unsigned int		t_blk_res_used; /* # of resvd blocks used */
+	unsigned int		t_rtx_res;	/* # of rt extents resvd */
+	unsigned int		t_rtx_res_used; /* # of resvd rt extents used */
+	xfs_log_ticket_t	t_ticket;	/* log mgr ticket */
+	sema_t			t_sema;		/* sema for commit completion */
+	xfs_lsn_t		t_lsn;		/* log seq num of trans commit*/
+	struct xfs_mount	*t_mountp;	/* ptr to fs mount struct */
+	struct xfs_dquot_acct	*t_dqinfo;	/* accting info for dquots */
+	xfs_trans_callback_t	t_callback;	/* transaction callback */
+	void			*t_callarg;	/* callback arg */
+	unsigned int		t_flags;	/* misc flags */
+	long			t_icount_delta; /* superblock icount change */
+	long			t_ifree_delta;	/* superblock ifree change */
+	long			t_fdblocks_delta; /* superblock fdblocks chg */
+	long			t_res_fdblocks_delta; /* on-disk only chg */
+	long			t_frextents_delta;/* superblock freextents chg*/
+	long			t_res_frextents_delta; /* on-disk only chg */
+	long			t_ag_freeblks_delta; /* debugging counter */
+	long			t_ag_flist_delta; /* debugging counter */
+	long			t_ag_btree_delta; /* debugging counter */
+	long			t_dblocks_delta;/* superblock dblocks change */
+	long			t_agcount_delta;/* superblock agcount change */
+	long			t_imaxpct_delta;/* superblock imaxpct change */
+	long			t_rextsize_delta;/* superblock rextsize chg */
+	long			t_rbmblocks_delta;/* superblock rbmblocks chg */
+	long			t_rblocks_delta;/* superblock rblocks change */
+	long			t_rextents_delta;/* superblocks rextents chg */
+	long			t_rextslog_delta;/* superblocks rextslog chg */
+	unsigned int		t_items_free;	/* log item descs free */
+	xfs_log_item_chunk_t	t_items;	/* first log item desc chunk */
+	xfs_trans_header_t	t_header;	/* header for in-log trans */
+	unsigned int		t_busy_free;	/* busy descs free */
+	xfs_log_busy_chunk_t	t_busy;		/* busy/async free blocks */
+} xfs_trans_t;
+
+#endif	/* __KERNEL__ */
+
+
+#define XFS_TRANS_MAGIC		0x5452414E	/* 'TRAN' */
+/*
+ * Values for t_flags.
+ */
+#define XFS_TRANS_DIRTY		0x01	/* something needs to be logged */
+#define XFS_TRANS_SB_DIRTY	0x02	/* superblock is modified */
+#define XFS_TRANS_PERM_LOG_RES	0x04	/* xact took a permanent log res */
+#define XFS_TRANS_SYNC		0x08	/* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY	0x10	/* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE	0x20	/* OK to use reserved data blocks */
+
+/*
+ * Values for call flags parameter.
+ */
+#define XFS_TRANS_NOSLEEP		0x1
+#define XFS_TRANS_WAIT			0x2
+#define XFS_TRANS_RELEASE_LOG_RES	0x4
+#define XFS_TRANS_ABORT			0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define XFS_TRANS_SB_ICOUNT		0x00000001
+#define XFS_TRANS_SB_IFREE		0x00000002
+#define XFS_TRANS_SB_FDBLOCKS		0x00000004
+#define XFS_TRANS_SB_RES_FDBLOCKS	0x00000008
+#define XFS_TRANS_SB_FREXTENTS		0x00000010
+#define XFS_TRANS_SB_RES_FREXTENTS	0x00000020
+#define XFS_TRANS_SB_DBLOCKS		0x00000040
+#define XFS_TRANS_SB_AGCOUNT		0x00000080
+#define XFS_TRANS_SB_IMAXPCT		0x00000100
+#define XFS_TRANS_SB_REXTSIZE		0x00000200
+#define XFS_TRANS_SB_RBMBLOCKS		0x00000400
+#define XFS_TRANS_SB_RBLOCKS		0x00000800
+#define XFS_TRANS_SB_REXTENTS		0x00001000
+#define XFS_TRANS_SB_REXTSLOG		0x00002000
+
+
+/*
+ * Various log reservation values.
+ * These are based on the size of the file system block
+ * because that is what most transactions manipulate.
+ * Each adds in an additional 128 bytes per item logged to
+ * try to account for the overhead of the transaction mechanism.
+ *
+ * Note:
+ * Most of the reservations underestimate the number of allocation
+ * groups into which they could free extents in the xfs_bmap_finish()
+ * call.  This is because the number in the worst case is quite high
+ * and quite unusual.  In order to fix this we need to change
+ * xfs_bmap_finish() to free extents in only a single AG at a time.
+ * This will require changes to the EFI code as well, however, so that
+ * the EFI for the extents not freed is logged again in each transaction.
+ * See bug 261917.
+ */
+
+/*
+ * Per-extent log reservation for the allocation btree changes
+ * involved in freeing or allocating an extent.
+ * 2 trees * (2 blocks/level * max depth - 1) * block size
+ */
+#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
+	((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
+#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
+	((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
+
+/*
+ * Per-directory log reservation for any directory change.
+ * dir blocks: (1 btree block per level + data block + free block) * dblock size
+ * bmap btree: (levels + 2) * max depth * block size
+ * v2 directory blocks can be fragmented below the dirblksize down to the fsb
+ * size, so account for that in the DAENTER macros.
+ */
+#define XFS_DIROP_LOG_RES(mp)	\
+	(XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
+	 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
+#define XFS_DIROP_LOG_COUNT(mp) \
+	(XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
+	 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
+
+/*
+ * In a write transaction we can allocate a maximum of 2
+ * extents.  This gives:
+ *    the inode getting the new extents: inode size
+ *    the inode\'s bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join:
+ *    the agfs of the ags containing the blocks: 2 * sector size
+ *    the agfls of the ags containing the blocks: 2 * sector size
+ *    the super block free block counter: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_WRITE_LOG_RES(mp) \
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
+	  (2 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
+	  (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
+	 ((2 * (mp)->m_sb.sb_sectsize) + \
+	  (2 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
+	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
+
+#define XFS_WRITE_LOG_RES(mp)	((mp)->m_reservations.tr_write)
+
+/*
+ * In truncating a file we free up to two extents at once.  We can modify:
+ *    the inode being truncated: inode size
+ *    the inode\'s bmap btree: (max depth + 1) * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
+	  (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
+	 ((4 * (mp)->m_sb.sb_sectsize) + \
+	  (4 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 4) + \
+	  (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
+
+#define XFS_ITRUNCATE_LOG_RES(mp)   ((mp)->m_reservations.tr_itruncate)
+
+/*
+ * In renaming a files we can modify:
+ *    the four inodes involved: 4 * inode size
+ *    the two directory btrees: 2 * (max depth + v2) * dir block size
+ *    the two directory bmap btrees: 2 * max depth * block size
+ * And the bmap_finish transaction can free dir and bmap blocks (two sets
+ *	of bmap blocks) giving:
+ *    the agf for the ags in which the blocks live: 3 * sector size
+ *    the agfl for the ags in which the blocks live: 3 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_RENAME_LOG_RES(mp) \
+	(MAX( \
+	 ((4 * (mp)->m_sb.sb_inodesize) + \
+	  (2 * XFS_DIROP_LOG_RES(mp)) + \
+	  (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
+	 ((3 * (mp)->m_sb.sb_sectsize) + \
+	  (3 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 3) + \
+	  (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
+
+#define XFS_RENAME_LOG_RES(mp)	((mp)->m_reservations.tr_rename)
+
+/*
+ * For creating a link to an inode:
+ *    the parent directory inode: inode size
+ *    the linked inode: inode size
+ *    the directory btree could split: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free some bmap blocks giving:
+ *    the agf for the ag in which the blocks live: sector size
+ *    the agfl for the ag in which the blocks live: sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_LINK_LOG_RES(mp) \
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_inodesize + \
+	  XFS_DIROP_LOG_RES(mp) + \
+	  (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
+	 ((mp)->m_sb.sb_sectsize + \
+	  (mp)->m_sb.sb_sectsize + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	  (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
+
+#define XFS_LINK_LOG_RES(mp)	((mp)->m_reservations.tr_link)
+
+/*
+ * For removing a directory entry we can modify:
+ *    the parent directory inode: inode size
+ *    the removed inode: inode size
+ *    the directory btree could join: (max depth + v2) * dir block size
+ *    the directory bmap btree could join or split: (max depth + v2) * blocksize
+ * And the bmap_finish transaction can free the dir and bmap blocks giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_REMOVE_LOG_RES(mp)	\
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_inodesize + \
+	  XFS_DIROP_LOG_RES(mp) + \
+	  (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
+	 ((2 * (mp)->m_sb.sb_sectsize) + \
+	  (2 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
+	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
+
+#define XFS_REMOVE_LOG_RES(mp)	((mp)->m_reservations.tr_remove)
+
+/*
+ * For symlink we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: 1 block
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode\'s bmap btree: (max depth + v2) * block size
+ *    the blocks for the symlink: 1 KB
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_SYMLINK_LOG_RES(mp)		\
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B(mp, 1) + \
+	  XFS_DIROP_LOG_RES(mp) + \
+	  1024 + \
+	  (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
+	 (2 * (mp)->m_sb.sb_sectsize + \
+	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
+	  XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
+
+#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
+
+/*
+ * For create we can modify:
+ *    the parent directory inode: inode size
+ *    the new inode: inode size
+ *    the inode btree entry: block size
+ *    the superblock for the nlink flag: sector size
+ *    the directory btree: (max depth + v2) * dir block size
+ *    the directory inode\'s bmap btree: (max depth + v2) * block size
+ * Or in the first xact we allocate some inodes giving:
+ *    the agi and agf of the ag getting the new inodes: 2 * sectorsize
+ *    the superblock for the nlink flag: sector size
+ *    the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ *    the inode btree: max depth * blocksize
+ *    the allocation btrees: 2 trees * (max depth - 1) * block size
+ */
+#define XFS_CALC_CREATE_LOG_RES(mp)		\
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_inodesize + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_FSB_TO_B(mp, 1) + \
+	  XFS_DIROP_LOG_RES(mp) + \
+	  (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
+	 (3 * (mp)->m_sb.sb_sectsize + \
+	  XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
+	  XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	  (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \
+	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
+
+#define XFS_CREATE_LOG_RES(mp)	((mp)->m_reservations.tr_create)
+
+/*
+ * Making a new directory is the same as creating a new file.
+ */
+#define XFS_CALC_MKDIR_LOG_RES(mp)	XFS_CALC_CREATE_LOG_RES(mp)
+
+#define XFS_MKDIR_LOG_RES(mp)	((mp)->m_reservations.tr_mkdir)
+
+/*
+ * In freeing an inode we can modify:
+ *    the inode being freed: inode size
+ *    the super block free inode counter: sector size
+ *    the agi hash list and counters: sector size
+ *    the inode btree entry: block size
+ *    the on disk inode before ours in the agi hash list: inode cluster size
+ */
+#define XFS_CALC_IFREE_LOG_RES(mp) \
+	((mp)->m_sb.sb_inodesize + \
+	 (mp)->m_sb.sb_sectsize + \
+	 (mp)->m_sb.sb_sectsize + \
+	 XFS_FSB_TO_B((mp), 1) + \
+	 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
+	 (128 * 5))
+
+#define XFS_IFREE_LOG_RES(mp)	((mp)->m_reservations.tr_ifree)
+
+/*
+ * When only changing the inode we log the inode and possibly the superblock
+ * We also add a bit of slop for the transaction stuff.
+ */
+#define XFS_CALC_ICHANGE_LOG_RES(mp)	((mp)->m_sb.sb_inodesize + \
+					 (mp)->m_sb.sb_sectsize + 512)
+
+#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
+
+/*
+ * Growing the data section of the filesystem.
+ *	superblock
+ *	agi and agf
+ *	allocation btrees
+ */
+#define XFS_CALC_GROWDATA_LOG_RES(mp) \
+	((mp)->m_sb.sb_sectsize * 3 + \
+	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
+
+#define XFS_GROWDATA_LOG_RES(mp)    ((mp)->m_reservations.tr_growdata)
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the first set of transactions (ALLOC) we allocate space to the
+ * bitmap or summary files.
+ *	superblock: sector size
+ *	agf of the ag from which the extent is allocated: sector size
+ *	bmap btree for bitmap/summary inode: max depth * blocksize
+ *	bitmap/summary inode: inode size
+ *	allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
+ */
+#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
+	(2 * (mp)->m_sb.sb_sectsize + \
+	 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
+	 (mp)->m_sb.sb_inodesize + \
+	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	 (128 * \
+	  (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
+	   XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
+
+#define XFS_GROWRTALLOC_LOG_RES(mp)	((mp)->m_reservations.tr_growrtalloc)
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the second set of transactions (ZERO) we zero the new metadata blocks.
+ *	one bitmap/summary block: blocksize
+ */
+#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
+	((mp)->m_sb.sb_blocksize + 128)
+
+#define XFS_GROWRTZERO_LOG_RES(mp)	((mp)->m_reservations.tr_growrtzero)
+
+/*
+ * Growing the rt section of the filesystem.
+ * In the third set of transactions (FREE) we update metadata without
+ * allocating any new blocks.
+ *	superblock: sector size
+ *	bitmap inode: inode size
+ *	summary inode: inode size
+ *	one bitmap block: blocksize
+ *	summary blocks: new summary size
+ */
+#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
+	((mp)->m_sb.sb_sectsize + \
+	 2 * (mp)->m_sb.sb_inodesize + \
+	 (mp)->m_sb.sb_blocksize + \
+	 (mp)->m_rsumsize + \
+	 (128 * 5))
+
+#define XFS_GROWRTFREE_LOG_RES(mp)	((mp)->m_reservations.tr_growrtfree)
+
+/*
+ * Logging the inode modification timestamp on a synchronous write.
+ *	inode
+ */
+#define XFS_CALC_SWRITE_LOG_RES(mp) \
+	((mp)->m_sb.sb_inodesize + 128)
+
+#define XFS_SWRITE_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
+
+/*
+ * Logging the inode timestamps on an fsync -- same as SWRITE
+ * as long as SWRITE logs the entire inode core
+ */
+#define XFS_FSYNC_TS_LOG_RES(mp)	((mp)->m_reservations.tr_swrite)
+
+/*
+ * Logging the inode mode bits when writing a setuid/setgid file
+ *	inode
+ */
+#define XFS_CALC_WRITEID_LOG_RES(mp) \
+	((mp)->m_sb.sb_inodesize + 128)
+
+#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
+
+/*
+ * Converting the inode from non-attributed to attributed.
+ *	the inode being converted: inode size
+ *	agf block and superblock (for block allocation)
+ *	the new block (directory sized)
+ *	bmap blocks for the new directory block
+ *	allocation btrees
+ */
+#define XFS_CALC_ADDAFORK_LOG_RES(mp)	\
+	((mp)->m_sb.sb_inodesize + \
+	 (mp)->m_sb.sb_sectsize * 2 + \
+	 (mp)->m_dirblksize + \
+	 (XFS_DIR_IS_V1(mp) ? 0 : \
+	    XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1))) + \
+	 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
+	 (128 * (4 + \
+		 (XFS_DIR_IS_V1(mp) ? 0 : \
+			 XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
+		 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
+
+#define XFS_ADDAFORK_LOG_RES(mp)	((mp)->m_reservations.tr_addafork)
+
+/*
+ * Removing the attribute fork of a file
+ *    the inode being truncated: inode size
+ *    the inode\'s bmap btree: max depth * block size
+ * And the bmap_finish transaction can free the blocks and bmap blocks:
+ *    the agf for each of the ags: 4 * sector size
+ *    the agfl for each of the ags: 4 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    worst case split in allocation btrees per extent assuming 4 extents:
+ *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_ATTRINVAL_LOG_RES(mp)	\
+	(MAX( \
+	 ((mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
+	  (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
+	 ((4 * (mp)->m_sb.sb_sectsize) + \
+	  (4 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 4) + \
+	  (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
+
+#define XFS_ATTRINVAL_LOG_RES(mp)	((mp)->m_reservations.tr_attrinval)
+
+/*
+ * Setting an attribute.
+ *	the inode getting the attribute
+ *	the superblock for allocations
+ *	the agfs extents are allocated from
+ *	the attribute btree * max depth
+ *	the inode allocation btree
+ * Since attribute transaction space is dependent on the size of the attribute,
+ * the calculation is done partially at mount time and partially at runtime.
+ */
+#define XFS_CALC_ATTRSET_LOG_RES(mp)	\
+	((mp)->m_sb.sb_inodesize + \
+	 (mp)->m_sb.sb_sectsize + \
+	  XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
+	  (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
+
+#define XFS_ATTRSET_LOG_RES(mp, ext)	\
+	((mp)->m_reservations.tr_attrset + \
+	 (ext * (mp)->m_sb.sb_sectsize) + \
+	 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
+	 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
+
+/*
+ * Removing an attribute.
+ *    the inode: inode size
+ *    the attribute btree could join: max depth * block size
+ *    the inode bmap btree could join or split: max depth * block size
+ * And the bmap_finish transaction can free the attr blocks freed giving:
+ *    the agf for the ag in which the blocks live: 2 * sector size
+ *    the agfl for the ag in which the blocks live: 2 * sector size
+ *    the superblock for the free block count: sector size
+ *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+#define XFS_CALC_ATTRRM_LOG_RES(mp)	\
+	(MAX( \
+	  ((mp)->m_sb.sb_inodesize + \
+	  XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
+	  XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
+	  (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
+	 ((2 * (mp)->m_sb.sb_sectsize) + \
+	  (2 * (mp)->m_sb.sb_sectsize) + \
+	  (mp)->m_sb.sb_sectsize + \
+	  XFS_ALLOCFREE_LOG_RES(mp, 2) + \
+	  (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
+
+#define XFS_ATTRRM_LOG_RES(mp)	((mp)->m_reservations.tr_attrrm)
+
+/*
+ * Clearing a bad agino number in an agi hash bucket.
+ */
+#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
+	((mp)->m_sb.sb_sectsize + 128)
+
+#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp)  ((mp)->m_reservations.tr_clearagi)
+
+
+/*
+ * Various log count values.
+ */
+#define XFS_DEFAULT_LOG_COUNT		1
+#define XFS_DEFAULT_PERM_LOG_COUNT	2
+#define XFS_ITRUNCATE_LOG_COUNT		2
+#define XFS_CREATE_LOG_COUNT		2
+#define XFS_MKDIR_LOG_COUNT		3
+#define XFS_SYMLINK_LOG_COUNT		3
+#define XFS_REMOVE_LOG_COUNT		2
+#define XFS_LINK_LOG_COUNT		2
+#define XFS_RENAME_LOG_COUNT		2
+#define XFS_WRITE_LOG_COUNT		2
+#define XFS_ADDAFORK_LOG_COUNT		2
+#define XFS_ATTRINVAL_LOG_COUNT		1
+#define XFS_ATTRSET_LOG_COUNT		3
+#define XFS_ATTRRM_LOG_COUNT		3
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer
+ * reference count values.  This determine how hard the buffer
+ * cache tries to hold onto the buffer.
+ */
+#define XFS_AGF_REF		4
+#define XFS_AGI_REF		4
+#define XFS_AGFL_REF		3
+#define XFS_INO_BTREE_REF	3
+#define XFS_ALLOC_BTREE_REF	2
+#define XFS_BMAP_BTREE_REF	2
+#define XFS_DIR_BTREE_REF	2
+#define XFS_ATTR_BTREE_REF	1
+#define XFS_INO_REF		1
+#define XFS_DQUOT_REF		1
+
+#ifdef __KERNEL__
+/*
+ * XFS transaction mechanism exported interfaces that are
+ * actually macros.
+ */
+#define xfs_trans_get_log_res(tp)	((tp)->t_log_res)
+#define xfs_trans_get_log_count(tp)	((tp)->t_log_count)
+#define xfs_trans_get_block_res(tp)	((tp)->t_blk_res)
+#define xfs_trans_set_sync(tp)		((tp)->t_flags |= XFS_TRANS_SYNC)
+
+#ifdef DEBUG
+#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (long)d)
+#define xfs_trans_agflist_delta(tp, d)	((tp)->t_ag_flist_delta += (long)d)
+#define xfs_trans_agbtree_delta(tp, d)	((tp)->t_ag_btree_delta += (long)d)
+#else
+#define xfs_trans_agblocks_delta(tp, d)
+#define xfs_trans_agflist_delta(tp, d)
+#define xfs_trans_agbtree_delta(tp, d)
+#endif
+
+/*
+ * XFS transaction mechanism exported interfaces.
+ */
+void		xfs_trans_init(struct xfs_mount *);
+xfs_trans_t	*xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*_xfs_trans_alloc(struct xfs_mount *, uint);
+xfs_trans_t	*xfs_trans_dup(xfs_trans_t *);
+int		xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
+				  uint, uint);
+void		xfs_trans_callback(xfs_trans_t *,
+				   void (*)(xfs_trans_t *, void *), void *);
+void		xfs_trans_mod_sb(xfs_trans_t *, uint, long);
+struct xfs_buf	*xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t,
+				   int, uint);
+int		xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *,
+				   struct xfs_buftarg *, xfs_daddr_t, int, uint,
+				   struct xfs_buf **);
+struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
+
+void		xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bhold_until_committed(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
+void		xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+int		xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
+			       xfs_ino_t , uint, struct xfs_inode **);
+void		xfs_trans_iput(xfs_trans_t *, struct xfs_inode *, uint);
+void		xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint);
+void		xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *);
+void		xfs_trans_ihold_release(xfs_trans_t *, struct xfs_inode *);
+void		xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
+void		xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
+struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
+void		xfs_efi_release(struct xfs_efi_log_item *, uint);
+void		xfs_trans_log_efi_extent(xfs_trans_t *,
+					 struct xfs_efi_log_item *,
+					 xfs_fsblock_t,
+					 xfs_extlen_t);
+struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *,
+				  struct xfs_efi_log_item *,
+				  uint);
+void		xfs_trans_log_efd_extent(xfs_trans_t *,
+					 struct xfs_efd_log_item *,
+					 xfs_fsblock_t,
+					 xfs_extlen_t);
+void		xfs_trans_log_create_rpc(xfs_trans_t *, int, xfs_ino_t);
+void		xfs_trans_log_setattr_rpc(xfs_trans_t *, int);
+int		xfs_trans_commit(xfs_trans_t *, uint flags, xfs_lsn_t *);
+void		xfs_trans_commit_async(struct xfs_mount *);
+void		xfs_trans_cancel(xfs_trans_t *, int);
+void		xfs_trans_ail_init(struct xfs_mount *);
+xfs_lsn_t	xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
+xfs_lsn_t	xfs_trans_tail_ail(struct xfs_mount *);
+void		xfs_trans_unlocked_item(struct xfs_mount *,
+					xfs_log_item_t *);
+xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
+					xfs_agnumber_t ag,
+					xfs_extlen_t idx);
+
+/*
+ * Not necessarily exported, but used outside a single file.
+ */
+int		xfs_trans_lsn_danger(struct xfs_mount *, xfs_lsn_t);
+
+#endif	/* __KERNEL__ */
+
+#endif	/* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
new file mode 100644
index 000000000000..ebae51cd352e
--- /dev/null
+++ b/fs/xfs/xfs_trans_ail.c
@@ -0,0 +1,588 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *);
+STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *);
+
+#ifdef XFSDEBUG
+STATIC void xfs_ail_check(xfs_ail_entry_t *);
+#else
+#define xfs_ail_check(a)
+#endif /* XFSDEBUG */
+
+
+/*
+ * This is called by the log manager code to determine the LSN
+ * of the tail of the log.  This is exactly the LSN of the first
+ * item in the AIL.  If the AIL is empty, then this function
+ * returns 0.
+ *
+ * We need the AIL lock in order to get a coherent read of the
+ * lsn of the last item in the AIL.
+ */
+xfs_lsn_t
+xfs_trans_tail_ail(
+	xfs_mount_t	*mp)
+{
+	xfs_lsn_t	lsn;
+	xfs_log_item_t	*lip;
+	SPLDECL(s);
+
+	AIL_LOCK(mp,s);
+	lip = xfs_ail_min(&(mp->m_ail));
+	if (lip == NULL) {
+		lsn = (xfs_lsn_t)0;
+	} else {
+		lsn = lip->li_lsn;
+	}
+	AIL_UNLOCK(mp, s);
+
+	return lsn;
+}
+
+/*
+ * xfs_trans_push_ail
+ *
+ * This routine is called to move the tail of the AIL
+ * forward.  It does this by trying to flush items in the AIL
+ * whose lsns are below the given threshold_lsn.
+ *
+ * The routine returns the lsn of the tail of the log.
+ */
+xfs_lsn_t
+xfs_trans_push_ail(
+	xfs_mount_t		*mp,
+	xfs_lsn_t		threshold_lsn)
+{
+	xfs_lsn_t		lsn;
+	xfs_log_item_t		*lip;
+	int			gen;
+	int			restarts;
+	int			lock_result;
+	int			flush_log;
+	SPLDECL(s);
+
+#define XFS_TRANS_PUSH_AIL_RESTARTS	10
+
+	AIL_LOCK(mp,s);
+	lip = xfs_trans_first_ail(mp, &gen);
+	if (lip == NULL || XFS_FORCED_SHUTDOWN(mp)) {
+		/*
+		 * Just return if the AIL is empty.
+		 */
+		AIL_UNLOCK(mp, s);
+		return (xfs_lsn_t)0;
+	}
+
+	XFS_STATS_INC(xfsstats.xs_push_ail);
+
+	/*
+	 * While the item we are looking at is below the given threshold
+	 * try to flush it out.	 Make sure to limit the number of times
+	 * we allow xfs_trans_next_ail() to restart scanning from the
+	 * beginning of the list.  We'd like not to stop until we've at least
+	 * tried to push on everything in the AIL with an LSN less than
+	 * the given threshold. However, we may give up before that if
+	 * we realize that we've been holding the AIL_LOCK for 'too long',
+	 * blocking interrupts. Currently, too long is < 500us roughly.
+	 */
+	flush_log = 0;
+	restarts = 0;
+	while (((restarts < XFS_TRANS_PUSH_AIL_RESTARTS) &&
+		(XFS_LSN_CMP(lip->li_lsn, threshold_lsn) < 0))) {
+		/*
+		 * If we can lock the item without sleeping, unlock
+		 * the AIL lock and flush the item.  Then re-grab the
+		 * AIL lock so we can look for the next item on the
+		 * AIL.	 Since we unlock the AIL while we flush the
+		 * item, the next routine may start over again at the
+		 * the beginning of the list if anything has changed.
+		 * That is what the generation count is for.
+		 *
+		 * If we can't lock the item, either its holder will flush
+		 * it or it is already being flushed or it is being relogged.
+		 * In any of these case it is being taken care of and we
+		 * can just skip to the next item in the list.
+		 */
+		lock_result = IOP_TRYLOCK(lip);
+		switch (lock_result) {
+		      case XFS_ITEM_SUCCESS:
+			AIL_UNLOCK(mp, s);
+			XFS_STATS_INC(xfsstats.xs_push_ail_success);
+			IOP_PUSH(lip);
+			AIL_LOCK(mp,s);
+			break;
+
+		      case XFS_ITEM_PUSHBUF:
+			AIL_UNLOCK(mp, s);
+			XFS_STATS_INC(xfsstats.xs_push_ail_pushbuf);
+#ifdef XFSRACEDEBUG
+			delay_for_intr();
+			delay(300);
+#endif
+			ASSERT(lip->li_ops->iop_pushbuf);
+			ASSERT(lip);
+			IOP_PUSHBUF(lip);
+			AIL_LOCK(mp,s);
+			break;
+
+		      case XFS_ITEM_PINNED:
+			XFS_STATS_INC(xfsstats.xs_push_ail_pinned);
+			flush_log = 1;
+			break;
+
+		      case XFS_ITEM_LOCKED:
+			XFS_STATS_INC(xfsstats.xs_push_ail_locked);
+			break;
+
+		      case XFS_ITEM_FLUSHING:
+			XFS_STATS_INC(xfsstats.xs_push_ail_flushing);
+			break;
+
+		      default:
+			ASSERT(0);
+			break;
+		}
+
+		lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
+		if (lip == NULL) {
+			break;
+		}
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			/*
+			 * Just return if we shut down during the last try.
+			 */
+			AIL_UNLOCK(mp, s);
+			return (xfs_lsn_t)0;
+		}
+
+	}
+
+	if (flush_log) {
+		/*
+		 * If something we need to push out was pinned, then
+		 * push out the log so it will become unpinned and
+		 * move forward in the AIL.
+		 */
+		AIL_UNLOCK(mp, s);
+		XFS_STATS_INC(xfsstats.xs_push_ail_flush);
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+		AIL_LOCK(mp, s);
+	}
+
+	lip = xfs_ail_min(&(mp->m_ail));
+	if (lip == NULL) {
+		lsn = (xfs_lsn_t)0;
+	} else {
+		lsn = lip->li_lsn;
+	}
+
+	AIL_UNLOCK(mp, s);
+	return lsn;
+}	/* xfs_trans_push_ail */
+
+
+/*
+ * This is to be called when an item is unlocked that may have
+ * been in the AIL.  It will wake up the first member of the AIL
+ * wait list if this item's unlocking might allow it to progress.
+ * If the item is in the AIL, then we need to get the AIL lock
+ * while doing our checking so we don't race with someone going
+ * to sleep waiting for this event in xfs_trans_push_ail().
+ */
+void
+xfs_trans_unlocked_item(
+	xfs_mount_t	*mp,
+	xfs_log_item_t	*lip)
+{
+	xfs_log_item_t	*min_lip;
+
+	/*
+	 * If we're forcibly shutting down, we may have
+	 * unlocked log items arbitrarily. The last thing
+	 * we want to do is to move the tail of the log
+	 * over some potentially valid data.
+	 */
+	if (!(lip->li_flags & XFS_LI_IN_AIL) ||
+	    XFS_FORCED_SHUTDOWN(mp)) {
+		return;
+	}
+
+	/*
+	 * This is the one case where we can call into xfs_ail_min()
+	 * without holding the AIL lock because we only care about the
+	 * case where we are at the tail of the AIL.  If the object isn't
+	 * at the tail, it doesn't matter what result we get back.  This
+	 * is slightly racy because since we were just unlocked, we could
+	 * go to sleep between the call to xfs_ail_min and the call to
+	 * xfs_log_move_tail, have someone else lock us, commit to us disk,
+	 * move us out of the tail of the AIL, and then we wake up.  However,
+	 * the call to xfs_log_move_tail() doesn't do anything if there's
+	 * not enough free space to wake people up so we're safe calling it.
+	 */
+	min_lip = xfs_ail_min(&mp->m_ail);
+
+	if (min_lip == lip)
+		xfs_log_move_tail(mp, 1);
+}	/* xfs_trans_unlocked_item */
+
+
+/*
+ * Update the position of the item in the AIL with the new
+ * lsn.	 If it is not yet in the AIL, add it.  Otherwise, move
+ * it to its new position by removing it and re-adding it.
+ *
+ * Wakeup anyone with an lsn less than the item's lsn.	If the item
+ * we move in the AIL is the minimum one, update the tail lsn in the
+ * log manager.
+ *
+ * Increment the AIL's generation count to indicate that the tree
+ * has changed.
+ *
+ * This function must be called with the AIL lock held.	 The lock
+ * is dropped before returning, so the caller must pass in the
+ * cookie returned by AIL_LOCK.
+ */
+void
+xfs_trans_update_ail(
+	xfs_mount_t	*mp,
+	xfs_log_item_t	*lip,
+	xfs_lsn_t	lsn,
+	unsigned long	s)
+{
+	xfs_ail_entry_t		*ailp;
+	xfs_log_item_t		*dlip=NULL;
+	xfs_log_item_t		*mlip;	/* ptr to minimum lip */
+
+	ailp = &(mp->m_ail);
+	mlip = xfs_ail_min(ailp);
+
+	if (lip->li_flags & XFS_LI_IN_AIL) {
+		dlip = xfs_ail_delete(ailp, lip);
+		ASSERT(dlip == lip);
+	} else {
+		lip->li_flags |= XFS_LI_IN_AIL;
+	}
+
+	lip->li_lsn = lsn;
+
+	xfs_ail_insert(ailp, lip);
+	mp->m_ail_gen++;
+
+	if (mlip == dlip) {
+		mlip = xfs_ail_min(&(mp->m_ail));
+		AIL_UNLOCK(mp, s);
+		xfs_log_move_tail(mp, mlip->li_lsn);
+	} else {
+		AIL_UNLOCK(mp, s);
+	}
+
+
+}	/* xfs_trans_update_ail */
+
+/*
+ * Delete the given item from the AIL.	It must already be in
+ * the AIL.
+ *
+ * Wakeup anyone with an lsn less than item's lsn.    If the item
+ * we delete in the AIL is the minimum one, update the tail lsn in the
+ * log manager.
+ *
+ * Clear the IN_AIL flag from the item, reset its lsn to 0, and
+ * bump the AIL's generation count to indicate that the tree
+ * has changed.
+ *
+ * This function must be called with the AIL lock held.	 The lock
+ * is dropped before returning, so the caller must pass in the
+ * cookie returned by AIL_LOCK.
+ */
+void
+xfs_trans_delete_ail(
+	xfs_mount_t	*mp,
+	xfs_log_item_t	*lip,
+	unsigned long	s)
+{
+	xfs_ail_entry_t		*ailp;
+	xfs_log_item_t		*dlip;
+	xfs_log_item_t		*mlip;
+
+	if (lip->li_flags & XFS_LI_IN_AIL) {
+		ailp = &(mp->m_ail);
+		mlip = xfs_ail_min(ailp);
+		dlip = xfs_ail_delete(ailp, lip);
+		ASSERT(dlip == lip);
+
+
+		lip->li_flags &= ~XFS_LI_IN_AIL;
+		lip->li_lsn = 0;
+		mp->m_ail_gen++;
+
+		if (mlip == dlip) {
+			mlip = xfs_ail_min(&(mp->m_ail));
+			AIL_UNLOCK(mp, s);
+			xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
+		} else {
+			AIL_UNLOCK(mp, s);
+		}
+	}
+	else {
+		/*
+		 * If the file system is not being shutdown, we are in
+		 * serious trouble if we get to this stage.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp))
+			AIL_UNLOCK(mp, s);
+		else {
+			xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
+				"xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL");
+			xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
+			AIL_UNLOCK(mp, s);
+		}
+	}
+}
+
+
+
+/*
+ * Return the item in the AIL with the smallest lsn.
+ * Return the current tree generation number for use
+ * in calls to xfs_trans_next_ail().
+ */
+xfs_log_item_t *
+xfs_trans_first_ail(
+	xfs_mount_t	*mp,
+	int		*gen)
+{
+	xfs_log_item_t	*lip;
+
+	lip = xfs_ail_min(&(mp->m_ail));
+	*gen = (int)mp->m_ail_gen;
+
+	return (lip);
+}
+
+/*
+ * If the generation count of the tree has not changed since the
+ * caller last took something from the AIL, then return the elmt
+ * in the tree which follows the one given.  If the count has changed,
+ * then return the minimum elmt of the AIL and bump the restarts counter
+ * if one is given.
+ */
+xfs_log_item_t *
+xfs_trans_next_ail(
+	xfs_mount_t	*mp,
+	xfs_log_item_t	*lip,
+	int		*gen,
+	int		*restarts)
+{
+	xfs_log_item_t	*nlip;
+
+	ASSERT(mp && lip && gen);
+	if (mp->m_ail_gen == *gen) {
+		nlip = xfs_ail_next(&(mp->m_ail), lip);
+	} else {
+		nlip = xfs_ail_min(&(mp->m_ail));
+		*gen = (int)mp->m_ail_gen;
+		if (restarts != NULL) {
+			XFS_STATS_INC(xfsstats.xs_push_ail_restarts);
+			(*restarts)++;
+		}
+	}
+
+	return (nlip);
+}
+
+
+/*
+ * The active item list (AIL) is a doubly linked list of log
+ * items sorted by ascending lsn.  The base of the list is
+ * a forw/back pointer pair embedded in the xfs mount structure.
+ * The base is initialized with both pointers pointing to the
+ * base.  This case always needs to be distinguished, because
+ * the base has no lsn to look at.  We almost always insert
+ * at the end of the list, so on inserts we search from the
+ * end of the list to find where the new item belongs.
+ */
+
+/*
+ * Initialize the doubly linked list to point only to itself.
+ */
+void
+xfs_trans_ail_init(
+	xfs_mount_t	*mp)
+{
+	mp->m_ail.ail_forw = (xfs_log_item_t*)&(mp->m_ail);
+	mp->m_ail.ail_back = (xfs_log_item_t*)&(mp->m_ail);
+}
+
+/*
+ * Insert the given log item into the AIL.
+ * We almost always insert at the end of the list, so on inserts
+ * we search from the end of the list to find where the
+ * new item belongs.
+ */
+STATIC void
+xfs_ail_insert(
+	xfs_ail_entry_t *base,
+	xfs_log_item_t	*lip)
+/* ARGSUSED */
+{
+	xfs_log_item_t	*next_lip;
+
+	/*
+	 * If the list is empty, just insert the item.
+	 */
+	if (base->ail_back == (xfs_log_item_t*)base) {
+		base->ail_forw = lip;
+		base->ail_back = lip;
+		lip->li_ail.ail_forw = (xfs_log_item_t*)base;
+		lip->li_ail.ail_back = (xfs_log_item_t*)base;
+		return;
+	}
+
+	next_lip = base->ail_back;
+	while ((next_lip != (xfs_log_item_t*)base) &&
+	       (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) {
+		next_lip = next_lip->li_ail.ail_back;
+	}
+	ASSERT((next_lip == (xfs_log_item_t*)base) ||
+	       (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
+	lip->li_ail.ail_forw = next_lip->li_ail.ail_forw;
+	lip->li_ail.ail_back = next_lip;
+	next_lip->li_ail.ail_forw = lip;
+	lip->li_ail.ail_forw->li_ail.ail_back = lip;
+
+	xfs_ail_check(base);
+	return;
+}
+
+/*
+ * Delete the given item from the AIL.	Return a pointer to the item.
+ */
+/*ARGSUSED*/
+STATIC xfs_log_item_t *
+xfs_ail_delete(
+	xfs_ail_entry_t *base,
+	xfs_log_item_t	*lip)
+/* ARGSUSED */
+{
+	lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back;
+	lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw;
+	lip->li_ail.ail_forw = NULL;
+	lip->li_ail.ail_back = NULL;
+
+	xfs_ail_check(base);
+	return lip;
+}
+
+/*
+ * Return a pointer to the first item in the AIL.
+ * If the AIL is empty, then return NULL.
+ */
+STATIC xfs_log_item_t *
+xfs_ail_min(
+	xfs_ail_entry_t *base)
+/* ARGSUSED */
+{
+	register xfs_log_item_t *forw = base->ail_forw;
+	if (forw == (xfs_log_item_t*)base) {
+		return NULL;
+	}
+	return forw;
+}
+
+/*
+ * Return a pointer to the item which follows
+ * the given item in the AIL.  If the given item
+ * is the last item in the list, then return NULL.
+ */
+STATIC xfs_log_item_t *
+xfs_ail_next(
+	xfs_ail_entry_t *base,
+	xfs_log_item_t	*lip)
+/* ARGSUSED */
+{
+	if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) {
+		return NULL;
+	}
+	return lip->li_ail.ail_forw;
+
+}
+
+#ifdef XFSDEBUG
+/*
+ * Check that the list is sorted as it should be.
+ */
+STATIC void
+xfs_ail_check(
+	xfs_ail_entry_t *base)
+{
+	xfs_log_item_t	*lip;
+	xfs_log_item_t	*prev_lip;
+
+	lip = base->ail_forw;
+	if (lip == (xfs_log_item_t*)base) {
+		/*
+		 * Make sure the pointers are correct when the list
+		 * is empty.
+		 */
+		ASSERT(base->ail_back == (xfs_log_item_t*)base);
+		return;
+	}
+
+	/*
+	 * Walk the list checking forward and backward pointers,
+	 * lsn ordering, and that every entry has the XFS_LI_IN_AIL
+	 * flag set.
+	 */
+	prev_lip = (xfs_log_item_t*)base;
+	while (lip != (xfs_log_item_t*)base) {
+		if (prev_lip != (xfs_log_item_t*)base) {
+			ASSERT(prev_lip->li_ail.ail_forw == lip);
+			ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
+		}
+		ASSERT(lip->li_ail.ail_back == prev_lip);
+		ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
+		prev_lip = lip;
+		lip = lip->li_ail.ail_forw;
+	}
+	ASSERT(lip == (xfs_log_item_t*)base);
+	ASSERT(base->ail_back == prev_lip);
+}
+#endif /* XFSDEBUG */
+
+
+
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
new file mode 100644
index 000000000000..e6bcf374e54c
--- /dev/null
+++ b/fs/xfs/xfs_trans_buf.c
@@ -0,0 +1,1085 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
+		xfs_daddr_t, int);
+STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
+		xfs_daddr_t, int);
+
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction.	 If it is already locked
+ * within the transaction, just increment its lock recursion count
+ * and return a pointer to it.
+ *
+ * Use the fast path function xfs_trans_buf_item_match() or the buffer
+ * cache routine incore_match() to find the buffer
+ * if it is already owned by this transaction.
+ *
+ * If we don't already own the buffer, use get_buf() to get it.
+ * If it doesn't yet have an associated xfs_buf_log_item structure,
+ * then allocate one and add the item to this transaction.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * get_buf() call.
+ */
+xfs_buf_t *
+xfs_trans_get_buf(xfs_trans_t	*tp,
+		  xfs_buftarg_t	*target_dev,
+		  xfs_daddr_t	blkno,
+		  int		len,
+		  uint		flags)
+{
+	xfs_buf_t		*bp;
+	xfs_buf_log_item_t	*bip;
+
+	if (flags == 0)
+		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+
+	/*
+	 * Default to a normal get_buf() call if the tp is NULL.
+	 */
+	if (tp == NULL) {
+		bp = xfs_buf_get_flags(target_dev, blkno, len,
+							flags | BUF_BUSY);
+		return(bp);
+	}
+
+	/*
+	 * If we find the buffer in the cache with this transaction
+	 * pointer in its b_fsprivate2 field, then we know we already
+	 * have it locked.  In this case we just increment the lock
+	 * recursion count and return the buffer to the caller.
+	 */
+	if (tp->t_items.lic_next == NULL) {
+		bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len);
+	} else {
+		bp  = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len);
+	}
+	if (bp != NULL) {
+		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+		if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
+			xfs_buftrace("TRANS GET RECUR SHUT", bp);
+			XFS_BUF_SUPER_STALE(bp);
+		}
+		/*
+		 * If the buffer is stale then it was binval'ed
+		 * since last read.  This doesn't matter since the
+		 * caller isn't allowed to use the data anyway.
+		 */
+		else if (XFS_BUF_ISSTALE(bp)) {
+			xfs_buftrace("TRANS GET RECUR STALE", bp);
+			ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
+		}
+		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+		bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+		ASSERT(bip != NULL);
+		ASSERT(atomic_read(&bip->bli_refcount) > 0);
+		bip->bli_recur++;
+		xfs_buftrace("TRANS GET RECUR", bp);
+		xfs_buf_item_trace("GET RECUR", bip);
+		return (bp);
+	}
+
+	/*
+	 * We always specify the BUF_BUSY flag within a transaction so
+	 * that get_buf does not try to push out a delayed write buffer
+	 * which might cause another transaction to take place (if the
+	 * buffer was delayed alloc).  Such recursive transactions can
+	 * easily deadlock with our current transaction as well as cause
+	 * us to run out of stack space.
+	 */
+	bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY);
+	if (bp == NULL) {
+		return NULL;
+	}
+
+	ASSERT(!XFS_BUF_GETERROR(bp));
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, tp->t_mountp);
+
+	/*
+	 * Set the recursion count for the buffer within this transaction
+	 * to 0.
+	 */
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	bip->bli_recur = 0;
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+	xfs_buftrace("TRANS GET", bp);
+	xfs_buf_item_trace("GET", bip);
+	return (bp);
+}
+
+/*
+ * Get and lock the superblock buffer of this file system for the
+ * given transaction.
+ *
+ * We don't need to use incore_match() here, because the superblock
+ * buffer is a private buffer which we keep a pointer to in the
+ * mount structure.
+ */
+xfs_buf_t *
+xfs_trans_getsb(xfs_trans_t	*tp,
+		struct xfs_mount *mp,
+		int		flags)
+{
+	xfs_buf_t		*bp;
+	xfs_buf_log_item_t	*bip;
+
+	/*
+	 * Default to just trying to lock the superblock buffer
+	 * if tp is NULL.
+	 */
+	if (tp == NULL) {
+		return (xfs_getsb(mp, flags));
+	}
+
+	/*
+	 * If the superblock buffer already has this transaction
+	 * pointer in its b_fsprivate2 field, then we know we already
+	 * have it locked.  In this case we just increment the lock
+	 * recursion count and return the buffer to the caller.
+	 */
+	bp = mp->m_sb_bp;
+	if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) {
+		bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+		ASSERT(bip != NULL);
+		ASSERT(atomic_read(&bip->bli_refcount) > 0);
+		bip->bli_recur++;
+		xfs_buf_item_trace("GETSB RECUR", bip);
+		return (bp);
+	}
+
+	bp = xfs_getsb(mp, flags);
+	if (bp == NULL) {
+		return NULL;
+	}
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, mp);
+
+	/*
+	 * Set the recursion count for the buffer within this transaction
+	 * to 0.
+	 */
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	bip->bli_recur = 0;
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+	xfs_buf_item_trace("GETSB", bip);
+	return (bp);
+}
+
+#ifdef DEBUG
+dev_t	xfs_error_dev = 0;
+int	xfs_do_error;
+int	xfs_req_num;
+int	xfs_error_mod = 33;
+#endif
+
+/*
+ * Get and lock the buffer for the caller if it is not already
+ * locked within the given transaction.	 If it has not yet been
+ * read in, read it from disk. If it is already locked
+ * within the transaction and already read in, just increment its
+ * lock recursion count and return a pointer to it.
+ *
+ * Use the fast path function xfs_trans_buf_item_match() or the buffer
+ * cache routine incore_match() to find the buffer
+ * if it is already owned by this transaction.
+ *
+ * If we don't already own the buffer, use read_buf() to get it.
+ * If it doesn't yet have an associated xfs_buf_log_item structure,
+ * then allocate one and add the item to this transaction.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * read_buf() call.
+ */
+int
+xfs_trans_read_buf(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len,
+	uint		flags,
+	xfs_buf_t	**bpp)
+{
+	xfs_buf_t		*bp;
+	xfs_buf_log_item_t	*bip;
+	int			error;
+
+	if (flags == 0)
+		flags = XFS_BUF_LOCK | XFS_BUF_MAPPED;
+
+	/*
+	 * Default to a normal get_buf() call if the tp is NULL.
+	 */
+	if (tp == NULL) {
+		bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
+		if (!bp)
+			return XFS_ERROR(ENOMEM);
+
+		if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+			xfs_ioerror_alert("xfs_trans_read_buf", mp,
+					  bp, blkno);
+			error = XFS_BUF_GETERROR(bp);
+			xfs_buf_relse(bp);
+			return error;
+		}
+#ifdef DEBUG
+		if (xfs_do_error && (bp != NULL)) {
+			if (xfs_error_dev == target->pbr_dev) {
+				if (((xfs_req_num++) % xfs_error_mod) == 0) {
+					xfs_buf_relse(bp);
+					printk("Returning error!\n");
+					return XFS_ERROR(EIO);
+				}
+			}
+		}
+#endif
+		if (XFS_FORCED_SHUTDOWN(mp))
+			goto shutdown_abort;
+		*bpp = bp;
+		return 0;
+	}
+
+	/*
+	 * If we find the buffer in the cache with this transaction
+	 * pointer in its b_fsprivate2 field, then we know we already
+	 * have it locked.  If it is already read in we just increment
+	 * the lock recursion count and return the buffer to the caller.
+	 * If the buffer is not yet read in, then we read it in, increment
+	 * the lock recursion count, and return it to the caller.
+	 */
+	if (tp->t_items.lic_next == NULL) {
+		bp = xfs_trans_buf_item_match(tp, target, blkno, len);
+	} else {
+		bp = xfs_trans_buf_item_match_all(tp, target, blkno, len);
+	}
+	if (bp != NULL) {
+		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
+		ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+		ASSERT((XFS_BUF_ISERROR(bp)) == 0);
+		if (!(XFS_BUF_ISDONE(bp))) {
+			xfs_buftrace("READ_BUF_INCORE !DONE", bp);
+			ASSERT(!XFS_BUF_ISASYNC(bp));
+			XFS_BUF_READ(bp);
+			xfsbdstrat(tp->t_mountp, bp);
+			xfs_iowait(bp);
+			if (XFS_BUF_GETERROR(bp) != 0) {
+				xfs_ioerror_alert("xfs_trans_read_buf", mp,
+						  bp, blkno);
+				error = XFS_BUF_GETERROR(bp);
+				xfs_buf_relse(bp);
+				/*
+				 * We can gracefully recover from most
+				 * read errors. Ones we can't are those
+				 * that happen after the transaction's
+				 * already dirty.
+				 */
+				if (tp->t_flags & XFS_TRANS_DIRTY)
+					xfs_force_shutdown(tp->t_mountp,
+							   XFS_METADATA_IO_ERROR);
+				return error;
+			}
+		}
+		/*
+		 * We never locked this buf ourselves, so we shouldn't
+		 * brelse it either. Just get out.
+		 */
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp);
+			*bpp = NULL;
+			return XFS_ERROR(EIO);
+		}
+
+
+		bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+		bip->bli_recur++;
+
+		ASSERT(atomic_read(&bip->bli_refcount) > 0);
+		xfs_buf_item_trace("READ RECUR", bip);
+		*bpp = bp;
+		return 0;
+	}
+
+	/*
+	 * We always specify the BUF_BUSY flag within a transaction so
+	 * that get_buf does not try to push out a delayed write buffer
+	 * which might cause another transaction to take place (if the
+	 * buffer was delayed alloc).  Such recursive transactions can
+	 * easily deadlock with our current transaction as well as cause
+	 * us to run out of stack space.
+	 */
+	bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
+	if (bp == NULL) {
+		*bpp = NULL;
+		return 0;
+	}
+	if (XFS_BUF_GETERROR(bp) != 0) {
+	    XFS_BUF_SUPER_STALE(bp);
+		xfs_buftrace("READ ERROR", bp);
+		error = XFS_BUF_GETERROR(bp);
+
+		xfs_ioerror_alert("xfs_trans_read_buf", mp,
+				  bp, blkno);
+		if (tp->t_flags & XFS_TRANS_DIRTY)
+			xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR);
+		xfs_buf_relse(bp);
+		return error;
+	}
+#ifdef DEBUG
+	if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) {
+		if (xfs_error_dev == target->pbr_dev) {
+			if (((xfs_req_num++) % xfs_error_mod) == 0) {
+				xfs_force_shutdown(tp->t_mountp,
+						   XFS_METADATA_IO_ERROR);
+				xfs_buf_relse(bp);
+				printk("Returning error in trans!\n");
+				return XFS_ERROR(EIO);
+			}
+		}
+	}
+#endif
+	if (XFS_FORCED_SHUTDOWN(mp))
+		goto shutdown_abort;
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, tp->t_mountp);
+
+	/*
+	 * Set the recursion count for the buffer within this transaction
+	 * to 0.
+	 */
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	bip->bli_recur = 0;
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+	xfs_buftrace("TRANS READ", bp);
+	xfs_buf_item_trace("READ", bip);
+	*bpp = bp;
+	return 0;
+
+shutdown_abort:
+	/*
+	 * the theory here is that buffer is good but we're
+	 * bailing out because the filesystem is being forcibly
+	 * shut down.  So we should leave the b_flags alone since
+	 * the buffer's not staled and just get out.
+	 */
+#if defined(DEBUG)
+	if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
+		cmn_err(CE_NOTE, "about to pop assert, bp == 0x%x\n", bp);
+#endif
+	ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) !=
+						(XFS_B_STALE|XFS_B_DELWRI));
+
+	xfs_buftrace("READ_BUF XFSSHUTDN", bp);
+	xfs_buf_relse(bp);
+	*bpp = NULL;
+	return XFS_ERROR(EIO);
+}
+
+
+/*
+ * Release the buffer bp which was previously acquired with one of the
+ * xfs_trans_... buffer allocation routines if the buffer has not
+ * been modified within this transaction.  If the buffer is modified
+ * within this transaction, do decrement the recursion count but do
+ * not release the buffer even if the count goes to 0.	If the buffer is not
+ * modified within the transaction, decrement the recursion count and
+ * release the buffer if the recursion count goes to 0.
+ *
+ * If the buffer is to be released and it was not modified before
+ * this transaction began, then free the buf_log_item associated with it.
+ *
+ * If the transaction pointer is NULL, make this just a normal
+ * brelse() call.
+ */
+void
+xfs_trans_brelse(xfs_trans_t	*tp,
+		 xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+	xfs_log_item_t		*lip;
+	xfs_log_item_desc_t	*lidp;
+
+	/*
+	 * Default to a normal brelse() call if the tp is NULL.
+	 */
+	if (tp == NULL) {
+		ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
+		/*
+		 * If there's a buf log item attached to the buffer,
+		 * then let the AIL know that the buffer is being
+		 * unlocked.
+		 */
+		if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
+			lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
+			if (lip->li_type == XFS_LI_BUF) {
+				bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
+				xfs_trans_unlocked_item(
+						bip->bli_item.li_mountp,
+						lip);
+			}
+		}
+		xfs_buf_relse(bp);
+		return;
+	}
+
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	/*
+	 * Find the item descriptor pointing to this buffer's
+	 * log item.  It must be there.
+	 */
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
+	ASSERT(lidp != NULL);
+
+	/*
+	 * If the release is just for a recursive lock,
+	 * then decrement the count and return.
+	 */
+	if (bip->bli_recur > 0) {
+		bip->bli_recur--;
+		xfs_buf_item_trace("RELSE RECUR", bip);
+		return;
+	}
+
+	/*
+	 * If the buffer is dirty within this transaction, we can't
+	 * release it until we commit.
+	 */
+	if (lidp->lid_flags & XFS_LID_DIRTY) {
+		xfs_buf_item_trace("RELSE DIRTY", bip);
+		return;
+	}
+
+	/*
+	 * If the buffer has been invalidated, then we can't release
+	 * it until the transaction commits to disk unless it is re-dirtied
+	 * as part of this transaction.	 This prevents us from pulling
+	 * the item from the AIL before we should.
+	 */
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		xfs_buf_item_trace("RELSE STALE", bip);
+		return;
+	}
+
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	xfs_buf_item_trace("RELSE", bip);
+
+	/*
+	 * Free up the log item descriptor tracking the released item.
+	 */
+	xfs_trans_free_item(tp, lidp);
+
+	/*
+	 * Clear the hold flag in the buf log item if it is set.
+	 * We wouldn't want the next user of the buffer to
+	 * get confused.
+	 */
+	if (bip->bli_flags & XFS_BLI_HOLD) {
+		bip->bli_flags &= ~XFS_BLI_HOLD;
+	}
+
+	/*
+	 * Drop our reference to the buf log item.
+	 */
+	atomic_dec(&bip->bli_refcount);
+
+	/*
+	 * If the buf item is not tracking data in the log, then
+	 * we must free it before releasing the buffer back to the
+	 * free pool.  Before releasing the buffer to the free pool,
+	 * clear the transaction pointer in b_fsprivate2 to dissolve
+	 * its relation to this transaction.
+	 */
+	if (!xfs_buf_item_dirty(bip)) {
+/***
+		ASSERT(bp->b_pincount == 0);
+***/
+		ASSERT(atomic_read(&bip->bli_refcount) == 0);
+		ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+		ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
+		xfs_buf_item_relse(bp);
+		bip = NULL;
+	}
+	XFS_BUF_SET_FSPRIVATE2(bp, NULL);
+
+	/*
+	 * If we've still got a buf log item on the buffer, then
+	 * tell the AIL that the buffer is being unlocked.
+	 */
+	if (bip != NULL) {
+		xfs_trans_unlocked_item(bip->bli_item.li_mountp,
+					(xfs_log_item_t*)bip);
+	}
+
+	xfs_buf_relse(bp);
+	return;
+}
+
+/*
+ * Add the locked buffer to the transaction.
+ * The buffer must be locked, and it cannot be associated with any
+ * transaction.
+ *
+ * If the buffer does not yet have a buf log item associated with it,
+ * then allocate one for it.  Then add the buf item to the transaction.
+ */
+void
+xfs_trans_bjoin(xfs_trans_t	*tp,
+		xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
+
+	/*
+	 * The xfs_buf_log_item pointer is stored in b_fsprivate.  If
+	 * it doesn't have one yet, then allocate one and initialize it.
+	 * The checks to see if one is there are in xfs_buf_item_init().
+	 */
+	xfs_buf_item_init(bp, tp->t_mountp);
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+
+	/*
+	 * Take a reference for this transaction on the buf item.
+	 */
+	atomic_inc(&bip->bli_refcount);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
+
+	/*
+	 * Initialize b_fsprivate2 so we can find it with incore_match()
+	 * in xfs_trans_get_buf() and friends above.
+	 */
+	XFS_BUF_SET_FSPRIVATE2(bp, tp);
+
+	xfs_buf_item_trace("BJOIN", bip);
+}
+
+/*
+ * Mark the buffer as not needing to be unlocked when the buf item's
+ * IOP_UNLOCK() routine is called.  The buffer must already be locked
+ * and associated with the given transaction.
+ */
+/* ARGSUSED */
+void
+xfs_trans_bhold(xfs_trans_t	*tp,
+		xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	bip->bli_flags |= XFS_BLI_HOLD;
+	xfs_buf_item_trace("BHOLD", bip);
+}
+
+/*
+ * This function is used to indicate that the buffer should not be
+ * unlocked until the transaction is committed to disk.	 Since we
+ * are going to keep the lock held, make the transaction synchronous
+ * so that the lock is not held too long.
+ *
+ * It uses the log item descriptor flag XFS_LID_SYNC_UNLOCK to
+ * delay the buf items's unlock call until the transaction is
+ * committed to disk or aborted.
+ */
+void
+xfs_trans_bhold_until_committed(xfs_trans_t	*tp,
+				xfs_buf_t	*bp)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+	ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
+	ASSERT(lidp != NULL);
+
+	lidp->lid_flags |= XFS_LID_SYNC_UNLOCK;
+	xfs_buf_item_trace("BHOLD UNTIL COMMIT", bip);
+
+	xfs_trans_set_sync(tp);
+}
+
+/*
+ * This is called to mark bytes first through last inclusive of the given
+ * buffer as needing to be logged when the transaction is committed.
+ * The buffer must already be associated with the given transaction.
+ *
+ * First and last are numbers relative to the beginning of this buffer,
+ * so the first byte in the buffer is numbered 0 regardless of the
+ * value of b_blkno.
+ */
+void
+xfs_trans_log_buf(xfs_trans_t	*tp,
+		  xfs_buf_t	*bp,
+		  uint		first,
+		  uint		last)
+{
+	xfs_buf_log_item_t	*bip;
+	xfs_log_item_desc_t	*lidp;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+	ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp)));
+	ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) ||
+	       (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks));
+
+	/*
+	 * Mark the buffer as needing to be written out eventually,
+	 * and set its iodone function to remove the buffer's buf log
+	 * item from the AIL and free it when the buffer is flushed
+	 * to disk.  See xfs_buf_attach_iodone() for more details
+	 * on li_cb and xfs_buf_iodone_callbacks().
+	 * If we end up aborting this transaction, we trap this buffer
+	 * inside the b_bdstrat callback so that this won't get written to
+	 * disk.
+	 */
+	XFS_BUF_DELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
+	bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone;
+
+	/*
+	 * If we invalidated the buffer within this transaction, then
+	 * cancel the invalidation now that we're dirtying the buffer
+	 * again.  There are no races with the code in xfs_buf_item_unpin(),
+	 * because we have a reference to the buffer this entire time.
+	 */
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		xfs_buf_item_trace("BLOG UNSTALE", bip);
+		bip->bli_flags &= ~XFS_BLI_STALE;
+		/* note this will have to change for page_buf interface... unstale isn't really an option RMC */
+		ASSERT(XFS_BUF_ISSTALE(bp));
+		XFS_BUF_UNSTALE(bp);
+		bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
+	}
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+	bip->bli_flags |= XFS_BLI_LOGGED;
+	xfs_buf_item_log(bip, first, last);
+	xfs_buf_item_trace("BLOG", bip);
+}
+
+
+/*
+ * This called to invalidate a buffer that is being used within
+ * a transaction.  Typically this is because the blocks in the
+ * buffer are being freed, so we need to prevent it from being
+ * written out when we're done.	 Allowing it to be written again
+ * might overwrite data in the free blocks if they are reallocated
+ * to a file.
+ *
+ * We prevent the buffer from being written out by clearing the
+ * B_DELWRI flag.  We can't always
+ * get rid of the buf log item at this point, though, because
+ * the buffer may still be pinned by another transaction.  If that
+ * is the case, then we'll wait until the buffer is committed to
+ * disk for the last time (we can tell by the ref count) and
+ * free it in xfs_buf_item_unpin().  Until it is cleaned up we
+ * will keep the buffer locked so that the buffer and buf log item
+ * are not reused.
+ */
+void
+xfs_trans_binval(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
+	ASSERT(lidp != NULL);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	if (bip->bli_flags & XFS_BLI_STALE) {
+		/*
+		 * If the buffer is already invalidated, then
+		 * just return.
+		 */
+		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
+		ASSERT(XFS_BUF_ISSTALE(bp));
+		ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
+		ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
+		ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+		ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
+		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
+		xfs_buftrace("XFS_BINVAL RECUR", bp);
+		xfs_buf_item_trace("BINVAL RECUR", bip);
+		return;
+	}
+
+	/*
+	 * Clear the dirty bit in the buffer and set the STALE flag
+	 * in the buf log item.	 The STALE flag will be used in
+	 * xfs_buf_item_unpin() to determine if it should clean up
+	 * when the last reference to the buf item is given up.
+	 * We set the XFS_BLI_CANCEL flag in the buf log format structure
+	 * and log the buf item.  This will be used at recovery time
+	 * to determine that copies of the buffer in the log before
+	 * this should not be replayed.
+	 * We mark the item descriptor and the transaction dirty so
+	 * that we'll hold the buffer until after the commit.
+	 *
+	 * Since we're invalidating the buffer, we also clear the state
+	 * about which parts of the buffer have been logged.  We also
+	 * clear the flag indicating that this is an inode buffer since
+	 * the data in the buffer will no longer be valid.
+	 *
+	 * We set the stale bit in the buffer as well since we're getting
+	 * rid of it.
+	 */
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_STALE(bp);
+	bip->bli_flags |= XFS_BLI_STALE;
+	bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+	bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
+	bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
+	bzero((char *)(bip->bli_format.blf_data_map),
+	      (bip->bli_format.blf_map_size * sizeof(uint)));
+	lidp->lid_flags |= XFS_LID_DIRTY;
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	xfs_buftrace("XFS_BINVAL", bp);
+	xfs_buf_item_trace("BINVAL", bip);
+}
+
+/*
+ * This call is used to indicate that the buffer contains on-disk
+ * inodes which must be handled specially during recovery.  They
+ * require special handling because only the di_next_unlinked from
+ * the inodes in the buffer should be recovered.  The rest of the
+ * data in the buffer is logged via the inodes themselves.
+ *
+ * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
+ * format structure so that we'll know what to do at recovery time.
+ */
+/* ARGSUSED */
+void
+xfs_trans_inode_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
+}
+
+
+/*
+ * Mark the buffer as being one which contains newly allocated
+ * inodes.  We need to make sure that even if this buffer is
+ * relogged as an 'inode buf' we still recover all of the inode
+ * images in the face of a crash.  This works in coordination with
+ * xfs_buf_item_committed() to ensure that the buffer remains in the
+ * AIL at its original location even after it has been relogged.
+ */
+/* ARGSUSED */
+void
+xfs_trans_inode_alloc_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+	ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
+
+	bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF;
+}
+
+
+/*
+ * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of
+ * dquots. However, unlike in inode buffer recovery, dquot buffers get
+ * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag).
+ * The only thing that makes dquot buffers different from regular
+ * buffers is that we must not replay dquot bufs when recovering
+ * if a _corresponding_ quotaoff has happened. We also have to distinguish
+ * between usr dquot bufs and grp dquot bufs, because usr and grp quotas
+ * can be turned off independently.
+ */
+/* ARGSUSED */
+void
+xfs_trans_dquot_buf(
+	xfs_trans_t	*tp,
+	xfs_buf_t	*bp,
+	uint		type)
+{
+	xfs_buf_log_item_t	*bip;
+
+	ASSERT(XFS_BUF_ISBUSY(bp));
+	ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+	ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+	ASSERT(type == XFS_BLI_UDQUOT_BUF ||
+	       type == XFS_BLI_GDQUOT_BUF);
+
+	bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	bip->bli_format.blf_flags |= type;
+}
+
+/*
+ * Check to see if a buffer matching the given parameters is already
+ * a part of the given transaction.  Only check the first, embedded
+ * chunk, since we don't want to spend all day scanning large transactions.
+ */
+STATIC xfs_buf_t *
+xfs_trans_buf_item_match(
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_desc_t	*lidp;
+	xfs_buf_log_item_t	*blip;
+	xfs_buf_t		*bp;
+	int			i;
+
+	bp = NULL;
+	len = BBTOB(len);
+	licp = &tp->t_items;
+	if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+		for (i = 0; i < licp->lic_unused; i++) {
+			/*
+			 * Skip unoccupied slots.
+			 */
+			if (XFS_LIC_ISFREE(licp, i)) {
+				continue;
+			}
+
+			lidp = XFS_LIC_SLOT(licp, i);
+			blip = (xfs_buf_log_item_t *)lidp->lid_item;
+			if (blip->bli_item.li_type != XFS_LI_BUF) {
+				continue;
+			}
+
+			bp = blip->bli_buf;
+			if ((XFS_BUF_TARGET_DEV(bp) == target->pbr_dev) &&
+			    (XFS_BUF_ADDR(bp) == blkno) &&
+			    (XFS_BUF_COUNT(bp) == len)) {
+				/*
+				 * We found it.	 Break out and
+				 * return the pointer to the buffer.
+				 */
+				break;
+			} else {
+				bp = NULL;
+			}
+		}
+	}
+	return bp;
+}
+
+/*
+ * Check to see if a buffer matching the given parameters is already
+ * a part of the given transaction.  Check all the chunks, we
+ * want to be thorough.
+ */
+STATIC xfs_buf_t *
+xfs_trans_buf_item_match_all(
+	xfs_trans_t	*tp,
+	xfs_buftarg_t	*target,
+	xfs_daddr_t	blkno,
+	int		len)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_desc_t	*lidp;
+	xfs_buf_log_item_t	*blip;
+	xfs_buf_t		*bp;
+	int			i;
+
+	bp = NULL;
+	len = BBTOB(len);
+	for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
+		if (XFS_LIC_ARE_ALL_FREE(licp)) {
+			ASSERT(licp == &tp->t_items);
+			ASSERT(licp->lic_next == NULL);
+			return NULL;
+		}
+		for (i = 0; i < licp->lic_unused; i++) {
+			/*
+			 * Skip unoccupied slots.
+			 */
+			if (XFS_LIC_ISFREE(licp, i)) {
+				continue;
+			}
+
+			lidp = XFS_LIC_SLOT(licp, i);
+			blip = (xfs_buf_log_item_t *)lidp->lid_item;
+			if (blip->bli_item.li_type != XFS_LI_BUF) {
+				continue;
+			}
+
+			bp = blip->bli_buf;
+			if ((XFS_BUF_TARGET_DEV(bp) == target->pbr_dev) &&
+			    (XFS_BUF_ADDR(bp) == blkno) &&
+			    (XFS_BUF_COUNT(bp) == len)) {
+				/*
+				 * We found it.	 Break out and
+				 * return the pointer to the buffer.
+				 */
+				return bp;
+			}
+		}
+	}
+	return NULL;
+}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
new file mode 100644
index 000000000000..5078aff439b8
--- /dev/null
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <xfs_quota_priv.h>
+
+
+/*
+ * Add the locked dquot to the transaction.
+ * The dquot must be locked, and it cannot be associated with any
+ * transaction.
+ */
+void
+xfs_trans_dqjoin(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp)
+{
+	xfs_dq_logitem_t    *lp;
+
+	ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp));
+	lp = &dqp->q_logitem;
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp));
+
+	/*
+	 * Initialize i_transp so we can later determine if this dquot is
+	 * associated with this transaction.
+	 */
+	dqp->q_transp = tp;
+}
+
+
+/*
+ * This is called to mark the dquot as needing
+ * to be logged when the transaction is committed.  The dquot must
+ * already be associated with the given transaction.
+ * Note that it marks the entire transaction as dirty. In the ordinary
+ * case, this gets called via xfs_trans_commit, after the transaction
+ * is already dirty. However, there's nothing stop this from getting
+ * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY
+ * flag.
+ */
+void
+xfs_trans_log_dquot(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp)
+{
+	xfs_log_item_desc_t	*lidp;
+
+	ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem));
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+}
+
+/*
+ * Carry forward whatever is left of the quota blk reservation to
+ * the spanky new transaction
+ */
+void
+xfs_trans_dup_dqinfo(
+	xfs_trans_t	*otp,
+	xfs_trans_t	*ntp)
+{
+	xfs_dqtrx_t	*oq, *nq;
+	int		i,j;
+	xfs_dqtrx_t	*oqa, *nqa;
+
+	xfs_trans_alloc_dqinfo(ntp);
+	oqa = otp->t_dqinfo->dqa_usrdquots;
+	nqa = ntp->t_dqinfo->dqa_usrdquots;
+
+	/*
+	 * Because the quota blk reservation is carried forward,
+	 * it is also necessary to carry forward the DQ_DIRTY flag.
+	 */
+	if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+		ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
+
+	for (j = 0; j < 2; j++) {
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			if (oqa[i].qt_dquot == NULL)
+				break;
+			oq = &oqa[i];
+			nq = &nqa[i];
+
+			nq->qt_dquot = oq->qt_dquot;
+			nq->qt_bcount_delta = nq->qt_icount_delta = 0;
+			nq->qt_rtbcount_delta = 0;
+
+			/*
+			 * Transfer whatever is left of the reservations.
+			 */
+			nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
+			oq->qt_blk_res = oq->qt_blk_res_used;
+
+			nq->qt_rtblk_res = oq->qt_rtblk_res -
+				oq->qt_rtblk_res_used;
+			oq->qt_rtblk_res = oq->qt_rtblk_res_used;
+
+			nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
+			oq->qt_ino_res = oq->qt_ino_res_used;
+
+		}
+		oqa = otp->t_dqinfo->dqa_grpdquots;
+		nqa = ntp->t_dqinfo->dqa_grpdquots;
+	}
+}
+
+/*
+ * Wrap around mod_dquot to account for both user and group quotas.
+ */
+void
+xfs_trans_mod_dquot_byino(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		field,
+	long		delta)
+{
+	ASSERT(tp);
+
+	if (tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);
+
+	if (XFS_IS_UQUOTA_ON(tp->t_mountp) && ip->i_udquot) {
+		(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+	}
+	if (XFS_IS_GQUOTA_ON(tp->t_mountp) && ip->i_gdquot) {
+		(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+	}
+}
+
+STATIC xfs_dqtrx_t *
+xfs_trans_get_dqtrx(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp)
+{
+	int		i;
+	xfs_dqtrx_t	*qa;
+
+	for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+		qa = XFS_QM_DQP_TO_DQACCT(tp, dqp);
+
+		if (qa[i].qt_dquot == NULL ||
+		    qa[i].qt_dquot == dqp) {
+			return (&qa[i]);
+		}
+	}
+
+	return (NULL);
+}
+
+/*
+ * Make the changes in the transaction structure.
+ * The moral equivalent to xfs_trans_mod_sb().
+ * We don't touch any fields in the dquot, so we don't care
+ * if it's locked or not (most of the time it won't be).
+ */
+void
+xfs_trans_mod_dquot(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp,
+	uint		field,
+	long		delta)
+{
+	xfs_dqtrx_t	*qtrx;
+
+	ASSERT(tp);
+	qtrx = NULL;
+
+	if (tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);
+	/*
+	 * Find either the first free slot or the slot that belongs
+	 * to this dquot.
+	 */
+	qtrx = xfs_trans_get_dqtrx(tp, dqp);
+	ASSERT(qtrx);
+	if (qtrx->qt_dquot == NULL)
+		qtrx->qt_dquot = dqp;
+
+	switch (field) {
+
+		/*
+		 * regular disk blk reservation
+		 */
+	      case XFS_TRANS_DQ_RES_BLKS:
+		qtrx->qt_blk_res += (ulong)delta;
+		break;
+
+		/*
+		 * inode reservation
+		 */
+	      case XFS_TRANS_DQ_RES_INOS:
+		qtrx->qt_ino_res += (ulong)delta;
+		break;
+
+		/*
+		 * disk blocks used.
+		 */
+	      case XFS_TRANS_DQ_BCOUNT:
+		if (qtrx->qt_blk_res && delta > 0) {
+			qtrx->qt_blk_res_used += (ulong)delta;
+			ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
+		}
+		qtrx->qt_bcount_delta += delta;
+		break;
+
+	      case XFS_TRANS_DQ_DELBCOUNT:
+		qtrx->qt_delbcnt_delta += delta;
+		break;
+
+		/*
+		 * Inode Count
+		 */
+	      case XFS_TRANS_DQ_ICOUNT:
+		if (qtrx->qt_ino_res && delta > 0) {
+			qtrx->qt_ino_res_used += (ulong)delta;
+			ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+		}
+		qtrx->qt_icount_delta += delta;
+		break;
+
+		/*
+		 * rtblk reservation
+		 */
+	      case XFS_TRANS_DQ_RES_RTBLKS:
+		qtrx->qt_rtblk_res += (ulong)delta;
+		break;
+
+		/*
+		 * rtblk count
+		 */
+	      case XFS_TRANS_DQ_RTBCOUNT:
+		if (qtrx->qt_rtblk_res && delta > 0) {
+			qtrx->qt_rtblk_res_used += (ulong)delta;
+			ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
+		}
+		qtrx->qt_rtbcount_delta += delta;
+		break;
+
+	      case XFS_TRANS_DQ_DELRTBCOUNT:
+		qtrx->qt_delrtb_delta += delta;
+		break;
+
+	      default:
+		ASSERT(0);
+	}
+	tp->t_flags |= XFS_TRANS_DQ_DIRTY;
+}
+
+
+/*
+ * Given an array of dqtrx structures, lock all the dquots associated
+ * and join them to the transaction, provided they have been modified.
+ * We know that the highest number of dquots (of one type - usr OR grp),
+ * involved in a transaction is 2 and that both usr and grp combined - 3.
+ * So, we don't attempt to make this very generic.
+ */
+STATIC void
+xfs_trans_dqlockedjoin(
+	xfs_trans_t	*tp,
+	xfs_dqtrx_t	*q)
+{
+	ASSERT(q[0].qt_dquot != NULL);
+	if (q[1].qt_dquot == NULL) {
+		xfs_dqlock(q[0].qt_dquot);
+		xfs_trans_dqjoin(tp, q[0].qt_dquot);
+	} else {
+		ASSERT(XFS_QM_TRANS_MAXDQS == 2);
+		xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
+		xfs_trans_dqjoin(tp, q[0].qt_dquot);
+		xfs_trans_dqjoin(tp, q[1].qt_dquot);
+	}
+}
+
+
+/*
+ * Called by xfs_trans_commit() and similar in spirit to
+ * xfs_trans_apply_sb_deltas().
+ * Go thru all the dquots belonging to this transaction and modify the
+ * INCORE dquot to reflect the actual usages.
+ * Unreserve just the reservations done by this transaction
+ * dquot is still left locked at exit.
+ */
+void
+xfs_trans_apply_dquot_deltas(
+	xfs_trans_t		*tp)
+{
+	int			i, j;
+	xfs_dquot_t		*dqp;
+	xfs_dqtrx_t		*qtrx, *qa;
+	xfs_disk_dquot_t	*d;
+	long			totalbdelta;
+	long			totalrtbdelta;
+
+	ASSERT(tp->t_dqinfo);
+	qa = tp->t_dqinfo->dqa_usrdquots;
+	for (j = 0; j < 2; j++) {
+		if (qa[0].qt_dquot == NULL) {
+			qa = tp->t_dqinfo->dqa_grpdquots;
+			continue;
+		}
+
+		/*
+		 * Lock all of the dquots and join them to the transaction.
+		 */
+		xfs_trans_dqlockedjoin(tp, qa);
+
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			qtrx = &qa[i];
+			/*
+			 * The array of dquots is filled
+			 * sequentially, not sparsely.
+			 */
+			if ((dqp = qtrx->qt_dquot) == NULL)
+				break;
+
+			ASSERT(XFS_DQ_IS_LOCKED(dqp));
+			ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp));
+
+			/*
+			 * adjust the actual number of blocks used
+			 */
+			d = &dqp->q_core;
+
+			/*
+			 * The issue here is - sometimes we don't make a blkquota
+			 * reservation intentionally to be fair to users
+			 * (when the amount is small). On the other hand,
+			 * delayed allocs do make reservations, but that's
+			 * outside of a transaction, so we have no
+			 * idea how much was really reserved.
+			 * So, here we've accumulated delayed allocation blks and
+			 * non-delay blks. The assumption is that the
+			 * delayed ones are always reserved (outside of a
+			 * transaction), and the others may or may not have
+			 * quota reservations.
+			 */
+			totalbdelta = qtrx->qt_bcount_delta +
+				qtrx->qt_delbcnt_delta;
+			totalrtbdelta = qtrx->qt_rtbcount_delta +
+				qtrx->qt_delrtb_delta;
+#ifdef QUOTADEBUG
+			if (totalbdelta < 0)
+				ASSERT(INT_GET(d->d_bcount, ARCH_CONVERT) >=
+				       (xfs_qcnt_t) -totalbdelta);
+
+			if (totalrtbdelta < 0)
+				ASSERT(INT_GET(d->d_rtbcount, ARCH_CONVERT) >=
+				       (xfs_qcnt_t) -totalrtbdelta);
+
+			if (qtrx->qt_icount_delta < 0)
+				ASSERT(INT_GET(d->d_icount, ARCH_CONVERT) >=
+				       (xfs_qcnt_t) -qtrx->qt_icount_delta);
+#endif
+			if (totalbdelta)
+				INT_MOD(d->d_bcount, ARCH_CONVERT, (xfs_qcnt_t)totalbdelta);
+
+			if (qtrx->qt_icount_delta)
+				INT_MOD(d->d_icount, ARCH_CONVERT, (xfs_qcnt_t)qtrx->qt_icount_delta);
+
+			if (totalrtbdelta)
+				INT_MOD(d->d_rtbcount, ARCH_CONVERT, (xfs_qcnt_t)totalrtbdelta);
+
+			/*
+			 * Start/reset the timer(s) if needed.
+			 */
+			xfs_qm_adjust_dqtimers(tp->t_mountp, d);
+
+			dqp->dq_flags |= XFS_DQ_DIRTY;
+			/*
+			 * add this to the list of items to get logged
+			 */
+			xfs_trans_log_dquot(tp, dqp);
+			/*
+			 * Take off what's left of the original reservation.
+			 * In case of delayed allocations, there's no
+			 * reservation that a transaction structure knows of.
+			 */
+			if (qtrx->qt_blk_res != 0) {
+				if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
+					if (qtrx->qt_blk_res >
+					    qtrx->qt_blk_res_used)
+						dqp->q_res_bcount -= (xfs_qcnt_t)
+							(qtrx->qt_blk_res -
+							 qtrx->qt_blk_res_used);
+					else
+						dqp->q_res_bcount -= (xfs_qcnt_t)
+							(qtrx->qt_blk_res_used -
+							 qtrx->qt_blk_res);
+				}
+			} else {
+				/*
+				 * These blks were never reserved, either inside
+				 * a transaction or outside one (in a delayed
+				 * allocation). Also, this isn't always a
+				 * negative number since we sometimes
+				 * deliberately skip quota reservations.
+				 */
+				if (qtrx->qt_bcount_delta) {
+					dqp->q_res_bcount +=
+					      (xfs_qcnt_t)qtrx->qt_bcount_delta;
+				}
+			}
+			/*
+			 * Adjust the RT reservation.
+			 */
+			if (qtrx->qt_rtblk_res != 0) {
+				if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
+					if (qtrx->qt_rtblk_res >
+					    qtrx->qt_rtblk_res_used)
+					       dqp->q_res_rtbcount -= (xfs_qcnt_t)
+						       (qtrx->qt_rtblk_res -
+							qtrx->qt_rtblk_res_used);
+					else
+					       dqp->q_res_rtbcount -= (xfs_qcnt_t)
+						       (qtrx->qt_rtblk_res_used -
+							qtrx->qt_rtblk_res);
+				}
+			} else {
+				if (qtrx->qt_rtbcount_delta)
+					dqp->q_res_rtbcount +=
+					    (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
+			}
+
+			/*
+			 * Adjust the inode reservation.
+			 */
+			if (qtrx->qt_ino_res != 0) {
+				ASSERT(qtrx->qt_ino_res >=
+				       qtrx->qt_ino_res_used);
+				if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
+					dqp->q_res_icount -= (xfs_qcnt_t)
+						(qtrx->qt_ino_res -
+						 qtrx->qt_ino_res_used);
+			} else {
+				if (qtrx->qt_icount_delta)
+					dqp->q_res_icount +=
+					    (xfs_qcnt_t)qtrx->qt_icount_delta;
+			}
+
+
+#ifdef QUOTADEBUG
+			if (qtrx->qt_rtblk_res != 0)
+				printk("RT res %d for 0x%p\n",
+				      (int) qtrx->qt_rtblk_res,
+				      dqp);
+#endif
+			ASSERT(dqp->q_res_bcount >= INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
+			ASSERT(dqp->q_res_icount >= INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
+			ASSERT(dqp->q_res_rtbcount >= INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT));
+		}
+		/*
+		 * Do the group quotas next
+		 */
+		qa = tp->t_dqinfo->dqa_grpdquots;
+	}
+}
+
+/*
+ * Release the reservations, and adjust the dquots accordingly.
+ * This is called only when the transaction is being aborted. If by
+ * any chance we have done dquot modifications incore (ie. deltas) already,
+ * we simply throw those away, since that's the expected behavior
+ * when a transaction is curtailed without a commit.
+ */
+void
+xfs_trans_unreserve_and_mod_dquots(
+	xfs_trans_t	*tp)
+{
+	int			i, j;
+	xfs_dquot_t		*dqp;
+	xfs_dqtrx_t		*qtrx, *qa;
+	boolean_t		locked;
+
+	ASSERT(tp->t_dqinfo);
+	qa = tp->t_dqinfo->dqa_usrdquots;
+
+	for (j = 0; j < 2; j++) {
+		for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+			qtrx = &qa[i];
+			/*
+			 * We assume that the array of dquots is filled
+			 * sequentially, not sparsely.
+			 */
+			if ((dqp = qtrx->qt_dquot) == NULL)
+				break;
+			/*
+			 * Unreserve the original reservation. We don't care
+			 * about the number of blocks used field, or deltas.
+			 * Also we don't bother to zero the fields.
+			 */
+			locked = B_FALSE;
+			if (qtrx->qt_blk_res) {
+				xfs_dqlock(dqp);
+				locked = B_TRUE;
+				dqp->q_res_bcount -=
+					(xfs_qcnt_t)qtrx->qt_blk_res;
+			}
+			if (qtrx->qt_ino_res) {
+				if (!locked) {
+					xfs_dqlock(dqp);
+					locked = B_TRUE;
+				}
+				dqp->q_res_icount -=
+					(xfs_qcnt_t)qtrx->qt_ino_res;
+			}
+
+			if (qtrx->qt_rtblk_res) {
+				if (!locked) {
+					xfs_dqlock(dqp);
+					locked = B_TRUE;
+				}
+				dqp->q_res_rtbcount -=
+					(xfs_qcnt_t)qtrx->qt_rtblk_res;
+			}
+			if (locked)
+				xfs_dqunlock(dqp);
+
+		}
+		qa = tp->t_dqinfo->dqa_grpdquots;
+	}
+}
+
+/*
+ * This reserves disk blocks and inodes against a dquot.
+ * Flags indicate if the dquot is to be locked here and also
+ * if the blk reservation is for RT or regular blocks.
+ * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
+ * Returns EDQUOT if quota is exceeded.
+ */
+STATIC int
+xfs_trans_dqresv(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*dqp,
+	long		nblks,
+	long		ninos,
+	uint		flags)
+{
+	int		error;
+	xfs_qcnt_t	hardlimit;
+	xfs_qcnt_t	softlimit;
+	time_t		btimer;
+	xfs_qcnt_t	*resbcountp;
+
+	if (! (flags & XFS_QMOPT_DQLOCK)) {
+		xfs_dqlock(dqp);
+	}
+	ASSERT(XFS_DQ_IS_LOCKED(dqp));
+	if (flags & XFS_TRANS_DQ_RES_BLKS) {
+		hardlimit = INT_GET(dqp->q_core.d_blk_hardlimit, ARCH_CONVERT);
+		softlimit = INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT);
+		btimer = INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT);
+		resbcountp = &dqp->q_res_bcount;
+	} else {
+		ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
+		hardlimit = INT_GET(dqp->q_core.d_rtb_hardlimit, ARCH_CONVERT);
+		softlimit = INT_GET(dqp->q_core.d_rtb_softlimit, ARCH_CONVERT);
+		btimer = INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT);
+		resbcountp = &dqp->q_res_rtbcount;
+	}
+	error = 0;
+
+	if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
+	    !INT_ISZERO(dqp->q_core.d_id, ARCH_CONVERT) &&
+	    XFS_IS_QUOTA_ENFORCED(dqp->q_mount)) {
+#ifdef QUOTADEBUG
+		printk("BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?\n",
+			nblks, *resbcountp, hardlimit);
+#endif
+		if (nblks > 0) {
+			/*
+			 * dquot is locked already. See if we'd go over the
+			 * hardlimit or exceed the timelimit if we allocate
+			 * nblks.
+			 */
+			if (hardlimit > 0ULL &&
+			     (hardlimit <= nblks + *resbcountp)) {
+				error = EDQUOT;
+				goto error_return;
+			}
+
+			if (softlimit > 0ULL &&
+			     (softlimit <= nblks + *resbcountp)) {
+				/*
+				 * If timer or warnings has expired,
+				 * return EDQUOT
+				 */
+				if ((btimer != 0 && CURRENT_TIME > btimer) ||
+				    (!INT_ISZERO(dqp->q_core.d_bwarns, ARCH_CONVERT) &&
+				     INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) >=
+				     XFS_QI_BWARNLIMIT(dqp->q_mount))) {
+					error = EDQUOT;
+					goto error_return;
+				}
+			}
+		}
+		if (ninos > 0) {
+			if (INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT) > 0ULL &&
+			    INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >=
+			    INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT)) {
+				error = EDQUOT;
+				goto error_return;
+			} else if (INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT) > 0ULL &&
+				   INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >=
+				   INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT)) {
+				/*
+				 * If timer or warnings has expired,
+				 * return EDQUOT
+				 */
+				if ((!INT_ISZERO(dqp->q_core.d_itimer, ARCH_CONVERT) &&
+				     CURRENT_TIME > INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT)) ||
+				    (!INT_ISZERO(dqp->q_core.d_iwarns, ARCH_CONVERT) &&
+				     INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) >=
+				     XFS_QI_IWARNLIMIT(dqp->q_mount))) {
+					error = EDQUOT;
+					goto error_return;
+				}
+			}
+		}
+	}
+
+	/*
+	 * Change the reservation, but not the actual usage.
+	 * Note that q_res_bcount = q_core.d_bcount + resv
+	 */
+	(*resbcountp) += (xfs_qcnt_t)nblks;
+	if (ninos != 0)
+		dqp->q_res_icount += (xfs_qcnt_t)ninos;
+
+	/*
+	 * note the reservation amt in the trans struct too,
+	 * so that the transaction knows how much was reserved by
+	 * it against this particular dquot.
+	 * We don't do this when we are reserving for a delayed allocation,
+	 * because we don't have the luxury of a transaction envelope then.
+	 */
+	if (tp) {
+		ASSERT(tp->t_dqinfo);
+		ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+		if (nblks != 0)
+			xfs_trans_mod_dquot(tp, dqp,
+					    flags & XFS_QMOPT_RESBLK_MASK,
+					    nblks);
+		if (ninos != 0)
+			xfs_trans_mod_dquot(tp, dqp,
+					    XFS_TRANS_DQ_RES_INOS,
+					    ninos);
+	}
+	ASSERT(dqp->q_res_bcount >= INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT));
+	ASSERT(dqp->q_res_rtbcount >= INT_GET(dqp->q_core.d_rtbcount, ARCH_CONVERT));
+	ASSERT(dqp->q_res_icount >= INT_GET(dqp->q_core.d_icount, ARCH_CONVERT));
+
+error_return:
+	if (! (flags & XFS_QMOPT_DQLOCK)) {
+		xfs_dqunlock(dqp);
+	}
+	return (error);
+}
+
+
+/*
+ * Given a dquot(s), make disk block and/or inode reservations against them.
+ * The fact that this does the reservation against both the usr and
+ * grp quotas is important, because this follows a both-or-nothing
+ * approach.
+ *
+ * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked.
+ *	   XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ *	   XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
+ *	   XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
+ * dquots are unlocked on return, if they were not locked by caller.
+ */
+int
+xfs_trans_reserve_quota_bydquots(
+	xfs_trans_t	*tp,
+	xfs_dquot_t	*udqp,
+	xfs_dquot_t	*gdqp,
+	long		nblks,
+	long		ninos,
+	uint		flags)
+{
+	int		resvd;
+
+	if (tp && tp->t_dqinfo == NULL)
+		xfs_trans_alloc_dqinfo(tp);
+
+	ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+	resvd = 0;
+
+	if (udqp) {
+		if (xfs_trans_dqresv(tp, udqp, nblks, ninos, flags))
+			return (EDQUOT);
+		resvd = 1;
+	}
+
+	if (gdqp) {
+		if (xfs_trans_dqresv(tp, gdqp, nblks, ninos, flags)) {
+			/*
+			 * can't do it, so backout previous reservation
+			 */
+			if (resvd) {
+				xfs_trans_dqresv(tp, udqp,  -nblks, -ninos,
+						 flags);
+			}
+			return (EDQUOT);
+		}
+	}
+
+	/*
+	 * Didnt change anything critical, so, no need to log
+	 */
+	return (0);
+}
+
+
+/*
+ * Lock the dquot and change the reservation if we can.
+ * This doesnt change the actual usage, just the reservation.
+ * The inode sent in is locked.
+ *
+ * Returns 0 on success, EDQUOT or other errors otherwise
+ */
+int
+xfs_trans_reserve_quota_nblks(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	long		nblks,
+	long		ninos,
+	uint		type)
+{
+	int error;
+
+#ifdef QUOTADEBUG
+	if (ip->i_udquot)
+		ASSERT(! XFS_DQ_IS_LOCKED(ip->i_udquot));
+	if (ip->i_gdquot)
+		ASSERT(! XFS_DQ_IS_LOCKED(ip->i_gdquot));
+#endif
+
+	ASSERT(XFS_ISLOCKED_INODE_EXCL(ip));
+	ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+	ASSERT((type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_RTBLKS ||
+	       (type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_BLKS);
+
+	/*
+	 * Reserve nblks against these dquots, with trans as the mediator.
+	 */
+	error = xfs_trans_reserve_quota_bydquots(tp,
+						 ip->i_udquot, ip->i_gdquot,
+						 nblks, ninos,
+						 type);
+	return (error);
+}
+
+/*
+ * This routine is called to allocate a quotaoff log item.
+ */
+xfs_qoff_logitem_t *
+xfs_trans_get_qoff_item(
+	xfs_trans_t		*tp,
+	xfs_qoff_logitem_t	*startqoff,
+	uint			flags)
+{
+	xfs_qoff_logitem_t	*q;
+
+	ASSERT(tp != NULL);
+
+	q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
+	ASSERT(q != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)q);
+
+	return (q);
+}
+
+
+/*
+ * This is called to mark the quotaoff logitem as needing
+ * to be logged when the transaction is committed.  The logitem must
+ * already be associated with the given transaction.
+ */
+void
+xfs_trans_log_quotaoff_item(
+	xfs_trans_t		*tp,
+	xfs_qoff_logitem_t	*qlp)
+{
+	xfs_log_item_desc_t	*lidp;
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp);
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+}
+
+void
+xfs_trans_alloc_dqinfo(
+	xfs_trans_t	*tp)
+{
+	(tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
+}
+
+void
+xfs_trans_free_dqinfo(
+	xfs_trans_t	*tp)
+{
+	kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo);
+	(tp)->t_dqinfo = NULL;
+}
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
new file mode 100644
index 000000000000..7ac26a45ccb6
--- /dev/null
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+/*
+ * This routine is called to allocate an "extent free intention"
+ * log item that will hold nextents worth of extents.  The
+ * caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efi_log_item_t *
+xfs_trans_get_efi(xfs_trans_t	*tp,
+		  uint		nextents)
+{
+	xfs_efi_log_item_t	*efip;
+
+	ASSERT(tp != NULL);
+	ASSERT(nextents > 0);
+
+	efip = xfs_efi_init(tp->t_mountp, nextents);
+	ASSERT(efip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip);
+
+	return (efip);
+}
+
+/*
+ * This routine is called to indicate that the described
+ * extent is to be logged as needing to be freed.  It should
+ * be called once for each extent to be freed.
+ */
+void
+xfs_trans_log_efi_extent(xfs_trans_t		*tp,
+			 xfs_efi_log_item_t	*efip,
+			 xfs_fsblock_t		start_block,
+			 xfs_extlen_t		ext_len)
+{
+	xfs_log_item_desc_t	*lidp;
+	uint			next_extent;
+	xfs_extent_t		*extp;
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip);
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+
+	next_extent = efip->efi_next_extent;
+	ASSERT(next_extent < efip->efi_format.efi_nextents);
+	extp = &(efip->efi_format.efi_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+	efip->efi_next_extent++;
+}
+
+
+/*
+ * This routine is called to allocate an "extent free done"
+ * log item that will hold nextents worth of extents.  The
+ * caller must use all nextents extents, because we are not
+ * flexible about this at all.
+ */
+xfs_efd_log_item_t *
+xfs_trans_get_efd(xfs_trans_t		*tp,
+		  xfs_efi_log_item_t	*efip,
+		  uint			nextents)
+{
+	xfs_efd_log_item_t	*efdp;
+
+	ASSERT(tp != NULL);
+	ASSERT(nextents > 0);
+
+	efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
+	ASSERT(efdp != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp);
+
+	return (efdp);
+}
+
+/*
+ * This routine is called to indicate that the described
+ * extent is to be logged as having been freed.	 It should
+ * be called once for each extent freed.
+ */
+void
+xfs_trans_log_efd_extent(xfs_trans_t		*tp,
+			 xfs_efd_log_item_t	*efdp,
+			 xfs_fsblock_t		start_block,
+			 xfs_extlen_t		ext_len)
+{
+	xfs_log_item_desc_t	*lidp;
+	uint			next_extent;
+	xfs_extent_t		*extp;
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp);
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+
+	next_extent = efdp->efd_next_extent;
+	ASSERT(next_extent < efdp->efd_format.efd_nextents);
+	extp = &(efdp->efd_format.efd_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+	efdp->efd_next_extent++;
+}
+
+
+
+
+
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
new file mode 100644
index 000000000000..b7fbf81b00da
--- /dev/null
+++ b/fs/xfs/xfs_trans_inode.c
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+#ifdef XFS_TRANS_DEBUG
+STATIC void
+xfs_trans_inode_broot_debug(
+	xfs_inode_t	*ip);
+#else
+#define xfs_trans_inode_broot_debug(ip)
+#endif
+
+
+/*
+ * Get and lock the inode for the caller if it is not already
+ * locked within the given transaction.	 If it is already locked
+ * within the transaction, just increment its lock recursion count
+ * and return a pointer to it.
+ *
+ * For an inode to be locked in a transaction, the inode lock, as
+ * opposed to the io lock, must be taken exclusively.  This ensures
+ * that the inode can be involved in only 1 transaction at a time.
+ * Lock recursion is handled on the io lock, but only for lock modes
+ * of equal or lesser strength.	 That is, you can recur on the io lock
+ * held EXCL with a SHARED request but not vice versa.	Also, if
+ * the inode is already a part of the transaction then you cannot
+ * go from not holding the io lock to having it EXCL or SHARED.
+ *
+ * Use the inode cache routine xfs_inode_incore() to find the inode
+ * if it is already owned by this transaction.
+ *
+ * If we don't already own the inode, use xfs_iget() to get it.
+ * Since the inode log item structure is embedded in the incore
+ * inode structure and is initialized when the inode is brought
+ * into memory, there is nothing to do with it here.
+ *
+ * If the given transaction pointer is NULL, just call xfs_iget().
+ * This simplifies code which must handle both cases.
+ */
+int
+xfs_trans_iget(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		lock_flags,
+	xfs_inode_t	**ipp)
+{
+	int			error;
+	xfs_inode_t		*ip;
+	xfs_inode_log_item_t	*iip;
+
+	/*
+	 * If the transaction pointer is NULL, just call the normal
+	 * xfs_iget().
+	 */
+	if (tp == NULL) {
+		return (xfs_iget(mp, NULL, ino, lock_flags, ipp, 0));
+	}
+
+	/*
+	 * If we find the inode in core with this transaction
+	 * pointer in its i_transp field, then we know we already
+	 * have it locked.  In this case we just increment the lock
+	 * recursion count and return the inode to the caller.
+	 * Assert that the inode is already locked in the mode requested
+	 * by the caller.  We cannot do lock promotions yet, so
+	 * die if someone gets this wrong.
+	 */
+	if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
+		/*
+		 * Make sure that the inode lock is held EXCL and
+		 * that the io lock is never upgraded when the inode
+		 * is already a part of the transaction.
+		 */
+		ASSERT(ip->i_itemp != NULL);
+		ASSERT(lock_flags & XFS_ILOCK_EXCL);
+		ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
+		       ismrlocked(&ip->i_iolock, MR_UPDATE));
+		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
+		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
+		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
+		       ismrlocked(&ip->i_iolock, (MR_UPDATE | MR_ACCESS)));
+		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
+		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
+
+		if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
+			ip->i_itemp->ili_iolock_recur++;
+		}
+		if (lock_flags & XFS_ILOCK_EXCL) {
+			ip->i_itemp->ili_ilock_recur++;
+		}
+		*ipp = ip;
+		return 0;
+	}
+
+	ASSERT(lock_flags & XFS_ILOCK_EXCL);
+	error = xfs_iget(tp->t_mountp, tp, ino, lock_flags, &ip, 0);
+	if (error) {
+		return error;
+	}
+	ASSERT(ip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	if (ip->i_itemp == NULL)
+		xfs_inode_item_init(ip, mp);
+	iip = ip->i_itemp;
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
+
+	xfs_trans_inode_broot_debug(ip);
+
+	/*
+	 * If the IO lock has been acquired, mark that in
+	 * the inode log item so we'll know to unlock it
+	 * when the transaction commits.
+	 */
+	ASSERT(iip->ili_flags == 0);
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
+	}
+
+	/*
+	 * Initialize i_transp so we can find it with xfs_inode_incore()
+	 * above.
+	 */
+	ip->i_transp = tp;
+
+	*ipp = ip;
+	return 0;
+}
+
+
+/*
+ * Release the inode ip which was previously acquired with xfs_trans_iget()
+ * or added with xfs_trans_ijoin(). This will decrement the lock
+ * recursion count of the inode item.  If the count goes to less than 0,
+ * the inode will be unlocked and disassociated from the transaction.
+ *
+ * If the inode has been modified within the transaction, it will not be
+ * unlocked until the transaction commits.
+ */
+void
+xfs_trans_iput(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		lock_flags)
+{
+	xfs_inode_log_item_t	*iip;
+	xfs_log_item_desc_t	*lidp;
+
+	/*
+	 * If the transaction pointer is NULL, just call xfs_iput().
+	 */
+	if (tp == NULL) {
+		xfs_iput(ip, lock_flags);
+	}
+
+	ASSERT(ip->i_transp == tp);
+	iip = ip->i_itemp;
+	ASSERT(iip != NULL);
+
+	/*
+	 * Find the item descriptor pointing to this inode's
+	 * log item.  It must be there.
+	 */
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)iip);
+	ASSERT(lidp != NULL);
+	ASSERT(lidp->lid_item == (xfs_log_item_t*)iip);
+
+	/*
+	 * Be consistent about the bookkeeping for the inode's
+	 * io lock, but it doesn't mean much really.
+	 */
+	ASSERT((iip->ili_flags & XFS_ILI_IOLOCKED_ANY) != XFS_ILI_IOLOCKED_ANY);
+	if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
+		ASSERT(iip->ili_flags & XFS_ILI_IOLOCKED_ANY);
+		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
+		       (iip->ili_flags & XFS_ILI_IOLOCKED_EXCL));
+		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
+		       (iip->ili_flags &
+			(XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)));
+		if (iip->ili_iolock_recur > 0) {
+			iip->ili_iolock_recur--;
+		}
+	}
+
+	/*
+	 * If the release is just for a recursive lock on the inode lock,
+	 * then decrement the count and return.	 We can assert that
+	 * the caller is dropping an EXCL lock on the inode, because
+	 * inode must be locked EXCL within transactions.
+	 */
+	ASSERT(lock_flags & XFS_ILOCK_EXCL);
+	if (iip->ili_ilock_recur > 0) {
+		iip->ili_ilock_recur--;
+		return;
+	}
+	ASSERT(iip->ili_iolock_recur == 0);
+
+	/*
+	 * If the inode was dirtied within this transaction, it cannot
+	 * be released until the transaction commits.
+	 */
+	if (lidp->lid_flags & XFS_LID_DIRTY) {
+		return;
+	}
+
+	xfs_trans_free_item(tp, lidp);
+
+	/*
+	 * Clear the hold and iolocked flags in the inode log item.
+	 * We wouldn't want the next user of the inode to
+	 * get confused.  Assert that if the iolocked flag is set
+	 * in the item then we are unlocking it in the call to xfs_iput()
+	 * below.
+	 */
+	ASSERT((!(iip->ili_flags & XFS_ILI_IOLOCKED_ANY)) ||
+	       (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)));
+	if (iip->ili_flags & (XFS_ILI_HOLD | XFS_ILI_IOLOCKED_ANY)) {
+		iip->ili_flags &= ~(XFS_ILI_HOLD | XFS_ILI_IOLOCKED_ANY);
+	}
+
+	/*
+	 * Unlike xfs_brelse() the inode log item cannot be
+	 * freed, because it is embedded within the inode.
+	 * All we have to do is release the inode.
+	 */
+	xfs_iput(ip, lock_flags);
+	return;
+}
+
+
+/*
+ * Add the locked inode to the transaction.
+ * The inode must be locked, and it cannot be associated with any
+ * transaction.	 The caller must specify the locks already held
+ * on the inode.
+ */
+void
+xfs_trans_ijoin(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		lock_flags)
+{
+	xfs_inode_log_item_t	*iip;
+
+	ASSERT(ip->i_transp == NULL);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+	ASSERT(lock_flags & XFS_ILOCK_EXCL);
+	if (ip->i_itemp == NULL)
+		xfs_inode_item_init(ip, ip->i_mount);
+	iip = ip->i_itemp;
+	ASSERT(iip->ili_flags == 0);
+	ASSERT(iip->ili_ilock_recur == 0);
+	ASSERT(iip->ili_iolock_recur == 0);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	(void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip));
+
+	xfs_trans_inode_broot_debug(ip);
+
+	/*
+	 * If the IO lock is already held, mark that in the inode log item.
+	 */
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
+	}
+
+	/*
+	 * Initialize i_transp so we can find it with xfs_inode_incore()
+	 * in xfs_trans_iget() above.
+	 */
+	ip->i_transp = tp;
+}
+
+
+
+/*
+ * Mark the inode as not needing to be unlocked when the inode item's
+ * IOP_UNLOCK() routine is called.  The inode must already be locked
+ * and associated with the given transaction.
+ */
+/*ARGSUSED*/
+void
+xfs_trans_ihold(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	ASSERT(ip->i_transp == tp);
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+
+	ip->i_itemp->ili_flags |= XFS_ILI_HOLD;
+}
+
+/*
+ * Cancel the previous inode hold request made on this inode
+ * for this transaction.
+ */
+/*ARGSUSED*/
+void
+xfs_trans_ihold_release(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	ASSERT(ip->i_transp == tp);
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+	ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
+
+	ip->i_itemp->ili_flags &= ~XFS_ILI_HOLD;
+}
+
+
+/*
+ * This is called to mark the fields indicated in fieldmask as needing
+ * to be logged when the transaction is committed.  The inode must
+ * already be associated with the given transaction.
+ *
+ * The values for fieldmask are defined in xfs_inode_item.h.  We always
+ * log all of the core inode if any of it has changed, and we always log
+ * all of the inline data/extents/b-tree root if any of them has changed.
+ */
+void
+xfs_trans_log_inode(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip,
+	uint		flags)
+{
+	xfs_log_item_desc_t	*lidp;
+
+	ASSERT(ip->i_transp == tp);
+	ASSERT(ip->i_itemp != NULL);
+	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
+
+	lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp));
+	ASSERT(lidp != NULL);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	lidp->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * Always OR in the bits from the ili_last_fields field.
+	 * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
+	 * routines in the eventual clearing of the ilf_fields bits.
+	 * See the big comment in xfs_iflush() for an explanation of
+	 * this coorination mechanism.
+	 */
+	flags |= ip->i_itemp->ili_last_fields;
+	ip->i_itemp->ili_format.ilf_fields |= flags;
+}
+
+#ifdef XFS_TRANS_DEBUG
+/*
+ * Keep track of the state of the inode btree root to make sure we
+ * log it properly.
+ */
+STATIC void
+xfs_trans_inode_broot_debug(
+	xfs_inode_t	*ip)
+{
+	xfs_inode_log_item_t	*iip;
+
+	ASSERT(ip->i_itemp != NULL);
+	iip = ip->i_itemp;
+	if (iip->ili_root_size != 0) {
+		ASSERT(iip->ili_orig_root != NULL);
+		kmem_free(iip->ili_orig_root, iip->ili_root_size);
+		iip->ili_root_size = 0;
+		iip->ili_orig_root = NULL;
+	}
+	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		ASSERT((ip->i_df.if_broot != NULL) &&
+		       (ip->i_df.if_broot_bytes > 0));
+		iip->ili_root_size = ip->i_df.if_broot_bytes;
+		iip->ili_orig_root =
+			(char*)kmem_alloc(iip->ili_root_size, KM_SLEEP);
+		bcopy((char*)(ip->i_df.if_broot), iip->ili_orig_root,
+		      iip->ili_root_size);
+	}
+}
+#endif
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
new file mode 100644
index 000000000000..a0ddb32a5b5a
--- /dev/null
+++ b/fs/xfs/xfs_trans_item.c
@@ -0,0 +1,557 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+
+STATIC int	xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
+					int, int, xfs_lsn_t);
+
+/*
+ * This is called to add the given log item to the transaction's
+ * list of log items.  It must find a free log item descriptor
+ * or allocate a new one and add the item to that descriptor.
+ * The function returns a pointer to item descriptor used to point
+ * to the new item.  The log item will now point to its new descriptor
+ * with its li_desc field.
+ */
+xfs_log_item_desc_t *
+xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_item_chunk_t	*licp;
+	int			i=0;
+
+	/*
+	 * If there are no free descriptors, allocate a new chunk
+	 * of them and put it at the front of the chunk list.
+	 */
+	if (tp->t_items_free == 0) {
+		licp = (xfs_log_item_chunk_t*)
+		       kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP);
+		ASSERT(licp != NULL);
+		/*
+		 * Initialize the chunk, and then
+		 * claim the first slot in the newly allocated chunk.
+		 */
+		XFS_LIC_INIT(licp);
+		XFS_LIC_CLAIM(licp, 0);
+		licp->lic_unused = 1;
+		XFS_LIC_INIT_SLOT(licp, 0);
+		lidp = XFS_LIC_SLOT(licp, 0);
+
+		/*
+		 * Link in the new chunk and update the free count.
+		 */
+		licp->lic_next = tp->t_items.lic_next;
+		tp->t_items.lic_next = licp;
+		tp->t_items_free = XFS_LIC_NUM_SLOTS - 1;
+
+		/*
+		 * Initialize the descriptor and the generic portion
+		 * of the log item.
+		 *
+		 * Point the new slot at this item and return it.
+		 * Also point the log item at its currently active
+		 * descriptor and set the item's mount pointer.
+		 */
+		lidp->lid_item = lip;
+		lidp->lid_flags = 0;
+		lidp->lid_size = 0;
+		lip->li_desc = lidp;
+		lip->li_mountp = tp->t_mountp;
+		return (lidp);
+	}
+
+	/*
+	 * Find the free descriptor. It is somewhere in the chunklist
+	 * of descriptors.
+	 */
+	licp = &tp->t_items;
+	while (licp != NULL) {
+		if (XFS_LIC_VACANCY(licp)) {
+			if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
+				i = licp->lic_unused;
+				ASSERT(XFS_LIC_ISFREE(licp, i));
+				break;
+			}
+			for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
+				if (XFS_LIC_ISFREE(licp, i))
+					break;
+			}
+			ASSERT(i <= XFS_LIC_MAX_SLOT);
+			break;
+		}
+		licp = licp->lic_next;
+	}
+	ASSERT(licp != NULL);
+	/*
+	 * If we find a free descriptor, claim it,
+	 * initialize it, and return it.
+	 */
+	XFS_LIC_CLAIM(licp, i);
+	if (licp->lic_unused <= i) {
+		licp->lic_unused = i + 1;
+		XFS_LIC_INIT_SLOT(licp, i);
+	}
+	lidp = XFS_LIC_SLOT(licp, i);
+	tp->t_items_free--;
+	lidp->lid_item = lip;
+	lidp->lid_flags = 0;
+	lidp->lid_size = 0;
+	lip->li_desc = lidp;
+	lip->li_mountp = tp->t_mountp;
+	return (lidp);
+}
+
+/*
+ * Free the given descriptor.
+ *
+ * This requires setting the bit in the chunk's free mask corresponding
+ * to the given slot.
+ */
+void
+xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
+{
+	uint			slot;
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_chunk_t	**licpp;
+
+	slot = XFS_LIC_DESC_TO_SLOT(lidp);
+	licp = XFS_LIC_DESC_TO_CHUNK(lidp);
+	XFS_LIC_RELSE(licp, slot);
+	lidp->lid_item->li_desc = NULL;
+	tp->t_items_free++;
+
+	/*
+	 * If there are no more used items in the chunk and this is not
+	 * the chunk embedded in the transaction structure, then free
+	 * the chunk. First pull it from the chunk list and then
+	 * free it back to the heap.  We didn't bother with a doubly
+	 * linked list here because the lists should be very short
+	 * and this is not a performance path.	It's better to save
+	 * the memory of the extra pointer.
+	 *
+	 * Also decrement the transaction structure's count of free items
+	 * by the number in a chunk since we are freeing an empty chunk.
+	 */
+	if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) {
+		licpp = &(tp->t_items.lic_next);
+		while (*licpp != licp) {
+			ASSERT(*licpp != NULL);
+			licpp = &((*licpp)->lic_next);
+		}
+		*licpp = licp->lic_next;
+		kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+		tp->t_items_free -= XFS_LIC_NUM_SLOTS;
+	}
+}
+
+/*
+ * This is called to find the descriptor corresponding to the given
+ * log item.  It returns a pointer to the descriptor.
+ * The log item MUST have a corresponding descriptor in the given
+ * transaction.	 This routine does not return NULL, it panics.
+ *
+ * The descriptor pointer is kept in the log item's li_desc field.
+ * Just return it.
+ */
+/*ARGSUSED*/
+xfs_log_item_desc_t *
+xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip)
+{
+	ASSERT(lip->li_desc != NULL);
+
+	return (lip->li_desc);
+}
+
+
+/*
+ * Return a pointer to the first descriptor in the chunk list.
+ * This does not return NULL if there are none, it panics.
+ *
+ * The first descriptor must be in either the first or second chunk.
+ * This is because the only chunk allowed to be empty is the first.
+ * All others are freed when they become empty.
+ *
+ * At some point this and xfs_trans_next_item() should be optimized
+ * to quickly look at the mask to determine if there is anything to
+ * look at.
+ */
+xfs_log_item_desc_t *
+xfs_trans_first_item(xfs_trans_t *tp)
+{
+	xfs_log_item_chunk_t	*licp;
+	int			i;
+
+	licp = &tp->t_items;
+	/*
+	 * If it's not in the first chunk, skip to the second.
+	 */
+	if (XFS_LIC_ARE_ALL_FREE(licp)) {
+		licp = licp->lic_next;
+	}
+
+	/*
+	 * Return the first non-free descriptor in the chunk.
+	 */
+	ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+	for (i = 0; i < licp->lic_unused; i++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+
+		return (XFS_LIC_SLOT(licp, i));
+	}
+	cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
+	return(NULL);
+}
+
+
+/*
+ * Given a descriptor, return the next descriptor in the chunk list.
+ * This returns NULL if there are no more used descriptors in the list.
+ *
+ * We do this by first locating the chunk in which the descriptor resides,
+ * and then scanning forward in the chunk and the list for the next
+ * used descriptor.
+ */
+/*ARGSUSED*/
+xfs_log_item_desc_t *
+xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
+{
+	xfs_log_item_chunk_t	*licp;
+	int			i;
+
+	licp = XFS_LIC_DESC_TO_CHUNK(lidp);
+
+	/*
+	 * First search the rest of the chunk. The for loop keeps us
+	 * from referencing things beyond the end of the chunk.
+	 */
+	for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+
+		return (XFS_LIC_SLOT(licp, i));
+	}
+
+	/*
+	 * Now search the next chunk.  It must be there, because the
+	 * next chunk would have been freed if it were empty.
+	 * If there is no next chunk, return NULL.
+	 */
+	if (licp->lic_next == NULL) {
+		return (NULL);
+	}
+
+	licp = licp->lic_next;
+	ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+	for (i = 0; i < licp->lic_unused; i++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+
+		return (XFS_LIC_SLOT(licp, i));
+	}
+	ASSERT(0);
+	/* NOTREACHED */
+	return 0; /* keep gcc quite */
+}
+
+/*
+ * This is called to unlock all of the items of a transaction and to free
+ * all the descriptors of that transaction.
+ *
+ * It walks the list of descriptors and unlocks each item.  It frees
+ * each chunk except that embedded in the transaction as it goes along.
+ */
+void
+xfs_trans_free_items(
+	xfs_trans_t	*tp,
+	int		flags)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_chunk_t	*next_licp;
+	int			abort;
+
+	abort = flags & XFS_TRANS_ABORT;
+	licp = &tp->t_items;
+	/*
+	 * Special case the embedded chunk so we don't free it below.
+	 */
+	if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		XFS_LIC_ALL_FREE(licp);
+		licp->lic_unused = 0;
+	}
+	licp = licp->lic_next;
+
+	/*
+	 * Unlock each item in each chunk and free the chunks.
+	 */
+	while (licp != NULL) {
+		ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+		(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+		next_licp = licp->lic_next;
+		kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+		licp = next_licp;
+	}
+
+	/*
+	 * Reset the transaction structure's free item count.
+	 */
+	tp->t_items_free = XFS_LIC_NUM_SLOTS;
+	tp->t_items.lic_next = NULL;
+}
+
+
+
+/*
+ * This is called to unlock the items associated with a transaction.
+ * Items which were not logged should be freed.
+ * Those which were logged must still be tracked so they can be unpinned
+ * when the transaction commits.
+ */
+void
+xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
+{
+	xfs_log_item_chunk_t	*licp;
+	xfs_log_item_chunk_t	*next_licp;
+	xfs_log_item_chunk_t	**licpp;
+	int			freed;
+
+	freed = 0;
+	licp = &tp->t_items;
+
+	/*
+	 * Special case the embedded chunk so we don't free.
+	 */
+	if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+		freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
+	}
+	licpp = &(tp->t_items.lic_next);
+	licp = licp->lic_next;
+
+	/*
+	 * Unlock each item in each chunk, free non-dirty descriptors,
+	 * and free empty chunks.
+	 */
+	while (licp != NULL) {
+		ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+		freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
+		next_licp = licp->lic_next;
+		if (XFS_LIC_ARE_ALL_FREE(licp)) {
+			*licpp = next_licp;
+			kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+			freed -= XFS_LIC_NUM_SLOTS;
+		} else {
+			licpp = &(licp->lic_next);
+		}
+		ASSERT(*licpp == next_licp);
+		licp = next_licp;
+	}
+
+	/*
+	 * Fix the free descriptor count in the transaction.
+	 */
+	tp->t_items_free += freed;
+}
+
+/*
+ * Unlock each item pointed to by a descriptor in the given chunk.
+ * Stamp the commit lsn into each item if necessary.
+ * Free descriptors pointing to items which are not dirty if freeing_chunk
+ * is zero. If freeing_chunk is non-zero, then we need to unlock all
+ * items in the chunk including those with XFS_LID_SYNC_UNLOCK set.
+ * Return the number of descriptors freed.
+ */
+STATIC int
+xfs_trans_unlock_chunk(
+	xfs_log_item_chunk_t	*licp,
+	int			freeing_chunk,
+	int			abort,
+	xfs_lsn_t		commit_lsn)
+{
+	xfs_log_item_desc_t	*lidp;
+	xfs_log_item_t		*lip;
+	int			i;
+	int			freed;
+
+	freed = 0;
+	lidp = licp->lic_descs;
+	for (i = 0; i < licp->lic_unused; i++, lidp++) {
+		if (XFS_LIC_ISFREE(licp, i)) {
+			continue;
+		}
+		lip = lidp->lid_item;
+		lip->li_desc = NULL;
+
+		if (commit_lsn != NULLCOMMITLSN)
+			IOP_COMMITTING(lip, commit_lsn);
+
+		/* XXXsup */
+		if (abort)
+			lip->li_flags |= XFS_LI_ABORTED;
+
+		/* if (abort) {
+			IOP_ABORT(lip);
+		} else */
+		if (!(lidp->lid_flags & XFS_LID_SYNC_UNLOCK) ||
+			   freeing_chunk || abort) {
+			IOP_UNLOCK(lip);
+		}
+
+		/*
+		 * Free the descriptor if the item is not dirty
+		 * within this transaction and the caller is not
+		 * going to just free the entire thing regardless.
+		 */
+		if (!(freeing_chunk) &&
+		    (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
+			XFS_LIC_RELSE(licp, i);
+			freed++;
+		}
+	}
+
+	return (freed);
+}
+
+
+/*
+ * This is called to add the given busy item to the transaction's
+ * list of busy items.	It must find a free busy item descriptor
+ * or allocate a new one and add the item to that descriptor.
+ * The function returns a pointer to busy descriptor used to point
+ * to the new busy entry.  The log busy entry will now point to its new
+ * descriptor with its ???? field.
+ */
+xfs_log_busy_slot_t *
+xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
+{
+	xfs_log_busy_chunk_t	*lbcp;
+	xfs_log_busy_slot_t	*lbsp;
+	int			i=0;
+
+	/*
+	 * If there are no free descriptors, allocate a new chunk
+	 * of them and put it at the front of the chunk list.
+	 */
+	if (tp->t_busy_free == 0) {
+		lbcp = (xfs_log_busy_chunk_t*)
+		       kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
+		ASSERT(lbcp != NULL);
+		/*
+		 * Initialize the chunk, and then
+		 * claim the first slot in the newly allocated chunk.
+		 */
+		XFS_LBC_INIT(lbcp);
+		XFS_LBC_CLAIM(lbcp, 0);
+		lbcp->lbc_unused = 1;
+		lbsp = XFS_LBC_SLOT(lbcp, 0);
+
+		/*
+		 * Link in the new chunk and update the free count.
+		 */
+		lbcp->lbc_next = tp->t_busy.lbc_next;
+		tp->t_busy.lbc_next = lbcp;
+		tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
+
+		/*
+		 * Initialize the descriptor and the generic portion
+		 * of the log item.
+		 *
+		 * Point the new slot at this item and return it.
+		 * Also point the log item at its currently active
+		 * descriptor and set the item's mount pointer.
+		 */
+		lbsp->lbc_ag = ag;
+		lbsp->lbc_idx = idx;
+		return (lbsp);
+	}
+
+	/*
+	 * Find the free descriptor. It is somewhere in the chunklist
+	 * of descriptors.
+	 */
+	lbcp = &tp->t_busy;
+	while (lbcp != NULL) {
+		if (XFS_LBC_VACANCY(lbcp)) {
+			if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
+				i = lbcp->lbc_unused;
+				break;
+			} else {
+				/* out-of-order vacancy */
+				printk("OOO vacancy lbcp 0x%p\n", lbcp);
+				ASSERT(0);
+			}
+		}
+		lbcp = lbcp->lbc_next;
+	}
+	ASSERT(lbcp != NULL);
+	/*
+	 * If we find a free descriptor, claim it,
+	 * initialize it, and return it.
+	 */
+	XFS_LBC_CLAIM(lbcp, i);
+	if (lbcp->lbc_unused <= i) {
+		lbcp->lbc_unused = i + 1;
+	}
+	lbsp = XFS_LBC_SLOT(lbcp, i);
+	tp->t_busy_free--;
+	lbsp->lbc_ag = ag;
+	lbsp->lbc_idx = idx;
+	return (lbsp);
+}
+
+
+/*
+ * xfs_trans_free_busy
+ * Free all of the busy lists from a transaction
+ */
+void
+xfs_trans_free_busy(xfs_trans_t *tp)
+{
+	xfs_log_busy_chunk_t	*lbcp;
+	xfs_log_busy_chunk_t	*lbcq;
+
+	lbcp = tp->t_busy.lbc_next;
+	while (lbcp != NULL) {
+		lbcq = lbcp->lbc_next;
+		kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t));
+		lbcp = lbcq;
+	}
+
+	XFS_LBC_INIT(&tp->t_busy);
+	tp->t_busy.lbc_unused = 0;
+}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
new file mode 100644
index 000000000000..08c8b16667b8
--- /dev/null
+++ b/fs/xfs/xfs_trans_priv.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2000, 2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_TRANS_PRIV_H__
+#define __XFS_TRANS_PRIV_H__
+
+struct xfs_log_item;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * From xfs_trans_item.c
+ */
+struct xfs_log_item_desc	*xfs_trans_add_item(struct xfs_trans *,
+					    struct xfs_log_item *);
+void				xfs_trans_free_item(struct xfs_trans *,
+					    struct xfs_log_item_desc *);
+struct xfs_log_item_desc	*xfs_trans_find_item(struct xfs_trans *,
+					     struct xfs_log_item *);
+struct xfs_log_item_desc	*xfs_trans_first_item(struct xfs_trans *);
+struct xfs_log_item_desc	*xfs_trans_next_item(struct xfs_trans *,
+					     struct xfs_log_item_desc *);
+void				xfs_trans_free_items(struct xfs_trans *, int);
+void				xfs_trans_unlock_items(struct xfs_trans *,
+							xfs_lsn_t);
+void				xfs_trans_free_busy(xfs_trans_t *tp);
+xfs_log_busy_slot_t		*xfs_trans_add_busy(xfs_trans_t *tp,
+						    xfs_agnumber_t ag,
+						    xfs_extlen_t idx);
+
+/*
+ * From xfs_trans_ail.c
+ */
+void			xfs_trans_update_ail(struct xfs_mount *,
+				     struct xfs_log_item *, xfs_lsn_t,
+				     unsigned long);
+void			xfs_trans_delete_ail(struct xfs_mount *,
+				     struct xfs_log_item *, unsigned long);
+struct xfs_log_item	*xfs_trans_first_ail(struct xfs_mount *, int *);
+struct xfs_log_item	*xfs_trans_next_ail(struct xfs_mount *,
+				     struct xfs_log_item *, int *, int *);
+
+
+#endif	/* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
new file mode 100644
index 000000000000..1dec9dc10ef9
--- /dev/null
+++ b/fs/xfs/xfs_trans_space.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_TRANS_SPACE_H__
+#define __XFS_TRANS_SPACE_H__
+
+/*
+ * Components of space reservations.
+ */
+#define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)	\
+		(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
+#define XFS_EXTENTADD_SPACE_RES(mp,w)	(XFS_BM_MAXLEVELS(mp,w) - 1)
+#define XFS_NEXTENTADD_SPACE_RES(mp,b,w)\
+	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+	  XFS_EXTENTADD_SPACE_RES(mp,w))
+#define XFS_DAENTER_1B(mp,w)	((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
+#define XFS_DAENTER_DBS(mp,w)	\
+	(XFS_DA_NODE_MAXDEPTH + \
+	 ((XFS_DIR_IS_V2(mp) && (w) == XFS_DATA_FORK) ? 2 : 0))
+#define XFS_DAENTER_BLOCKS(mp,w)	\
+	(XFS_DAENTER_1B(mp,w) * XFS_DAENTER_DBS(mp,w))
+#define XFS_DAENTER_BMAP1B(mp,w)	\
+	XFS_NEXTENTADD_SPACE_RES(mp, XFS_DAENTER_1B(mp, w), w)
+#define XFS_DAENTER_BMAPS(mp,w)		\
+	(XFS_DAENTER_DBS(mp,w) * XFS_DAENTER_BMAP1B(mp,w))
+#define XFS_DAENTER_SPACE_RES(mp,w)	\
+	(XFS_DAENTER_BLOCKS(mp,w) + XFS_DAENTER_BMAPS(mp,w))
+#define XFS_DAREMOVE_SPACE_RES(mp,w)	XFS_DAENTER_BMAPS(mp,w)
+#define XFS_DIRENTER_MAX_SPLIT(mp,nl)	\
+	(((mp)->m_sb.sb_blocksize == 512 && \
+	  XFS_DIR_IS_V1(mp) && \
+	  (nl) >= XFS_DIR_LEAF_CAN_DOUBLE_SPLIT_LEN) ? 2 : 1)
+#define XFS_DIRENTER_SPACE_RES(mp,nl)	\
+	(XFS_DAENTER_SPACE_RES(mp, XFS_DATA_FORK) * \
+	 XFS_DIRENTER_MAX_SPLIT(mp,nl))
+#define XFS_DIRREMOVE_SPACE_RES(mp)	\
+	XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
+#define XFS_IALLOC_SPACE_RES(mp)	\
+	(XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1)
+
+/*
+ * Space reservation values for various transactions.
+ */
+#define XFS_ADDAFORK_SPACE_RES(mp)	\
+	((mp)->m_dirblkfsbs + \
+	 (XFS_DIR_IS_V1(mp) ? 0 : XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)))
+#define XFS_ATTRRM_SPACE_RES(mp)	\
+	XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
+/* This macro is not used - see inline code in xfs_attr_set */
+#define XFS_ATTRSET_SPACE_RES(mp, v)	\
+	(XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK) + XFS_B_TO_FSB(mp, v))
+#define XFS_CREATE_SPACE_RES(mp,nl)	\
+	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_DIOSTRAT_SPACE_RES(mp, v)	\
+	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v))
+#define XFS_GROWFS_SPACE_RES(mp)	\
+	(2 * XFS_AG_MAXLEVELS(mp))
+#define XFS_GROWFSRT_SPACE_RES(mp,b)	\
+	((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK))
+#define XFS_LINK_SPACE_RES(mp,nl)	\
+	XFS_DIRENTER_SPACE_RES(mp,nl)
+#define XFS_MKDIR_SPACE_RES(mp,nl)	\
+	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_QM_DQALLOC_SPACE_RES(mp)	\
+	(XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + \
+	 XFS_DQUOT_CLUSTER_SIZE_FSB)
+#define XFS_QM_QINOCREATE_SPACE_RES(mp) \
+	XFS_IALLOC_SPACE_RES(mp)
+#define XFS_REMOVE_SPACE_RES(mp)	\
+	XFS_DIRREMOVE_SPACE_RES(mp)
+#define XFS_RENAME_SPACE_RES(mp,nl)	\
+	(XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
+#define XFS_SYMLINK_SPACE_RES(mp,nl,b)	\
+	(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+
+#endif	/* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
new file mode 100644
index 000000000000..aa6fe38dde5e
--- /dev/null
+++ b/fs/xfs/xfs_types.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_TYPES_H__
+#define __XFS_TYPES_H__
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+
+/*
+ * POSIX Extensions
+ */
+typedef unsigned char		uchar_t;
+typedef unsigned short		ushort_t;
+typedef unsigned int		uint_t;
+typedef unsigned long		ulong_t;
+
+/*
+ * Additional type declarations for XFS
+ */
+typedef signed char		__int8_t;
+typedef unsigned char		__uint8_t;
+typedef signed short int	__int16_t;
+typedef unsigned short int	__uint16_t;
+typedef signed int		__int32_t;
+typedef unsigned int		__uint32_t;
+typedef signed long long int	__int64_t;
+typedef unsigned long long int	__uint64_t;
+
+typedef enum { B_FALSE, B_TRUE } boolean_t;
+
+
+typedef __int64_t		prid_t;		/* project ID */
+typedef __uint32_t		inst_t;		/* an instruction */
+
+typedef __u64			xfs_off_t;
+typedef __u64			xfs_ino_t;	/* <inode> type */
+typedef __s64			xfs_daddr_t;	/* <disk address> type */
+typedef char *			xfs_caddr_t;	/* <core address> type */
+typedef __u32			xfs_dev_t;
+
+typedef struct timespec		timespec_t;
+
+typedef struct {
+	unsigned char	__u_bits[16];
+} uuid_t;
+
+/* __psint_t is the same size as a pointer */
+#if (BITS_PER_LONG == 32)
+typedef __int32_t __psint_t;
+typedef __uint32_t __psunsigned_t;
+#elif (BITS_PER_LONG == 64)
+typedef __int64_t __psint_t;
+typedef __uint64_t __psunsigned_t;
+#else
+#error BITS_PER_LONG must be 32 or 64
+#endif
+
+#endif	/* __KERNEL__ */
+
+/*
+ * Some types are conditional based on the selected configuration.
+ * Set XFS_BIG_FILESYSTEMS=1 or 0 depending on the desired configuration.
+ * XFS_BIG_FILESYSTEMS needs daddr_t to be 64 bits
+ *
+ * On linux right now we are limited to 2^32 512 byte blocks in a
+ * filesystem, Once this limit is changed, setting this to 1
+ * will allow XFS to go larger. With BIG_FILESYSTEMS set to 0
+ * a 4K block filesystem could still theoretically be 16Gbytes
+ * long, so on an ia32 box the 32 bit page index will then be
+ * the limiting factor.
+ */
+
+#ifndef XFS_BIG_FILESYSTEMS
+#define XFS_BIG_FILESYSTEMS	0
+#endif
+
+typedef __uint32_t	xfs_agblock_t;	/* blockno in alloc. group */
+typedef __uint32_t	xfs_extlen_t;	/* extent length in blocks */
+typedef __uint32_t	xfs_agnumber_t; /* allocation group number */
+typedef __int32_t	xfs_extnum_t;	/* # of extents in a file */
+typedef __int16_t	xfs_aextnum_t;	/* # extents in an attribute fork */
+typedef __int64_t	xfs_fsize_t;	/* bytes in a file */
+typedef __uint64_t	xfs_ufsize_t;	/* unsigned bytes in a file */
+
+typedef __int32_t	xfs_suminfo_t;	/* type of bitmap summary info */
+typedef __int32_t	xfs_rtword_t;	/* word type for bitmap manipulations */
+
+typedef __int64_t	xfs_lsn_t;	/* log sequence number */
+typedef __int32_t	xfs_tid_t;	/* transaction identifier */
+
+typedef __uint32_t	xfs_dablk_t;	/* dir/attr block number (in file) */
+typedef __uint32_t	xfs_dahash_t;	/* dir/attr hash value */
+
+typedef __uint16_t	xfs_prid_t;	/* prid_t truncated to 16bits in XFS */
+
+/*
+ * These types are 64 bits on disk but are either 32 or 64 bits in memory.
+ * Disk based types:
+ */
+typedef __uint64_t	xfs_dfsbno_t;	/* blockno in filesystem (agno|agbno) */
+typedef __uint64_t	xfs_drfsbno_t;	/* blockno in filesystem (raw) */
+typedef __uint64_t	xfs_drtbno_t;	/* extent (block) in realtime area */
+typedef __uint64_t	xfs_dfiloff_t;	/* block number in a file */
+typedef __uint64_t	xfs_dfilblks_t; /* number of blocks in a file */
+
+/*
+ * Memory based types are conditional.
+ */
+#if XFS_BIG_FILESYSTEMS
+typedef __uint64_t	xfs_fsblock_t;	/* blockno in filesystem (agno|agbno) */
+typedef __uint64_t	xfs_rfsblock_t; /* blockno in filesystem (raw) */
+typedef __uint64_t	xfs_rtblock_t;	/* extent (block) in realtime area */
+typedef __int64_t	xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+#else
+typedef __uint32_t	xfs_fsblock_t;	/* blockno in filesystem (agno|agbno) */
+typedef __uint32_t	xfs_rfsblock_t; /* blockno in filesystem (raw) */
+typedef __uint32_t	xfs_rtblock_t;	/* extent (block) in realtime area */
+typedef __int32_t	xfs_srtblock_t; /* signed version of xfs_rtblock_t */
+#endif
+typedef __uint64_t	xfs_fileoff_t;	/* block number in a file */
+typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
+typedef __uint64_t	xfs_filblks_t;	/* number of blocks in a file */
+
+typedef __uint8_t	xfs_arch_t;	/* architecutre of an xfs fs */
+
+/*
+ * Null values for the types.
+ */
+#define NULLDFSBNO	((xfs_dfsbno_t)-1)
+#define NULLDRFSBNO	((xfs_drfsbno_t)-1)
+#define NULLDRTBNO	((xfs_drtbno_t)-1)
+#define NULLDFILOFF	((xfs_dfiloff_t)-1)
+
+#define NULLFSBLOCK	((xfs_fsblock_t)-1)
+#define NULLRFSBLOCK	((xfs_rfsblock_t)-1)
+#define NULLRTBLOCK	((xfs_rtblock_t)-1)
+#define NULLFILEOFF	((xfs_fileoff_t)-1)
+
+#define NULLAGBLOCK	((xfs_agblock_t)-1)
+#define NULLAGNUMBER	((xfs_agnumber_t)-1)
+#define NULLEXTNUM	((xfs_extnum_t)-1)
+
+#define NULLCOMMITLSN	((xfs_lsn_t)-1)
+
+/*
+ * Max values for extlen, extnum, aextnum.
+ */
+#define MAXEXTLEN	((xfs_extlen_t)0x001fffff)	/* 21 bits */
+#define MAXEXTNUM	((xfs_extnum_t)0x7fffffff)	/* signed int */
+#define MAXAEXTNUM	((xfs_aextnum_t)0x7fff)		/* signed short */
+
+/*
+ * MAXNAMELEN is the length (including the terminating null) of
+ * the longest permissible file (component) name.
+ */
+#define MAXNAMELEN	256
+
+typedef enum {
+	XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi
+} xfs_lookup_t;
+
+typedef enum {
+	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
+	XFS_BTNUM_MAX
+} xfs_btnum_t;
+
+
+#if defined(CONFIG_PROC_FS) && defined(__KERNEL__) && !defined(XFS_STATS_OFF)
+/*
+ * XFS global statistics
+ */
+struct xfsstats {
+# define XFSSTAT_END_EXTENT_ALLOC	4
+	__uint32_t		xs_allocx;
+	__uint32_t		xs_allocb;
+	__uint32_t		xs_freex;
+	__uint32_t		xs_freeb;
+# define XFSSTAT_END_ALLOC_BTREE	(XFSSTAT_END_EXTENT_ALLOC+4)
+	__uint32_t		xs_abt_lookup;
+	__uint32_t		xs_abt_compare;
+	__uint32_t		xs_abt_insrec;
+	__uint32_t		xs_abt_delrec;
+# define XFSSTAT_END_BLOCK_MAPPING	(XFSSTAT_END_ALLOC_BTREE+7)
+	__uint32_t		xs_blk_mapr;
+	__uint32_t		xs_blk_mapw;
+	__uint32_t		xs_blk_unmap;
+	__uint32_t		xs_add_exlist;
+	__uint32_t		xs_del_exlist;
+	__uint32_t		xs_look_exlist;
+	__uint32_t		xs_cmp_exlist;
+# define XFSSTAT_END_BLOCK_MAP_BTREE	(XFSSTAT_END_BLOCK_MAPPING+4)
+	__uint32_t		xs_bmbt_lookup;
+	__uint32_t		xs_bmbt_compare;
+	__uint32_t		xs_bmbt_insrec;
+	__uint32_t		xs_bmbt_delrec;
+# define XFSSTAT_END_DIRECTORY_OPS	(XFSSTAT_END_BLOCK_MAP_BTREE+4)
+	__uint32_t		xs_dir_lookup;
+	__uint32_t		xs_dir_create;
+	__uint32_t		xs_dir_remove;
+	__uint32_t		xs_dir_getdents;
+# define XFSSTAT_END_TRANSACTIONS	(XFSSTAT_END_DIRECTORY_OPS+3)
+	__uint32_t		xs_trans_sync;
+	__uint32_t		xs_trans_async;
+	__uint32_t		xs_trans_empty;
+# define XFSSTAT_END_INODE_OPS		(XFSSTAT_END_TRANSACTIONS+7)
+	__uint32_t		xs_ig_attempts;
+	__uint32_t		xs_ig_found;
+	__uint32_t		xs_ig_frecycle;
+	__uint32_t		xs_ig_missed;
+	__uint32_t		xs_ig_dup;
+	__uint32_t		xs_ig_reclaims;
+	__uint32_t		xs_ig_attrchg;
+# define XFSSTAT_END_LOG_OPS		(XFSSTAT_END_INODE_OPS+5)
+	__uint32_t		xs_log_writes;
+	__uint32_t		xs_log_blocks;
+	__uint32_t		xs_log_noiclogs;
+	__uint32_t		xs_log_force;
+	__uint32_t		xs_log_force_sleep;
+# define XFSSTAT_END_TAIL_PUSHING	(XFSSTAT_END_LOG_OPS+10)
+	__uint32_t		xs_try_logspace;
+	__uint32_t		xs_sleep_logspace;
+	__uint32_t		xs_push_ail;
+	__uint32_t		xs_push_ail_success;
+	__uint32_t		xs_push_ail_pushbuf;
+	__uint32_t		xs_push_ail_pinned;
+	__uint32_t		xs_push_ail_locked;
+	__uint32_t		xs_push_ail_flushing;
+	__uint32_t		xs_push_ail_restarts;
+	__uint32_t		xs_push_ail_flush;
+# define XFSSTAT_END_WRITE_CONVERT	(XFSSTAT_END_TAIL_PUSHING+2)
+	__uint32_t		xs_xstrat_quick;
+	__uint32_t		xs_xstrat_split;
+# define XFSSTAT_END_READ_WRITE_OPS	(XFSSTAT_END_WRITE_CONVERT+2)
+	__uint32_t		xs_write_calls;
+	__uint32_t		xs_read_calls;
+# define XFSSTAT_END_ATTRIBUTE_OPS	(XFSSTAT_END_READ_WRITE_OPS+4)
+	__uint32_t		xs_attr_get;
+	__uint32_t		xs_attr_set;
+	__uint32_t		xs_attr_remove;
+	__uint32_t		xs_attr_list;
+# define XFSSTAT_END_QUOTA_OPS		(XFSSTAT_END_ATTRIBUTE_OPS+8)
+	__uint32_t		xs_qm_dqreclaims;
+	__uint32_t		xs_qm_dqreclaim_misses;
+	__uint32_t		xs_qm_dquot_dups;
+	__uint32_t		xs_qm_dqcachemisses;
+	__uint32_t		xs_qm_dqcachehits;
+	__uint32_t		xs_qm_dqwants;
+	__uint32_t		xs_qm_dqshake_reclaims;
+	__uint32_t		xs_qm_dqinact_reclaims;
+# define XFSSTAT_END_INODE_CLUSTER	(XFSSTAT_END_QUOTA_OPS+3)
+	__uint32_t		xs_iflush_count;
+	__uint32_t		xs_icluster_flushcnt;
+	__uint32_t		xs_icluster_flushinode;
+# define XFSSTAT_END_VNODE_OPS		(XFSSTAT_END_INODE_CLUSTER+8)
+	__uint32_t		vn_active;	/* # vnodes not on free lists */
+	__uint32_t		vn_alloc;	/* # times vn_alloc called */
+	__uint32_t		vn_get;		/* # times vn_get called */
+	__uint32_t		vn_hold;	/* # times vn_hold called */
+	__uint32_t		vn_rele;	/* # times vn_rele called */
+	__uint32_t		vn_reclaim;	/* # times vn_reclaim called */
+	__uint32_t		vn_remove;	/* # times vn_remove called */
+	__uint32_t		vn_free;	/* # times vn_free called */
+/* Extra precision counters */
+	__uint64_t		xs_xstrat_bytes;
+	__uint64_t		xs_write_bytes;
+	__uint64_t		xs_read_bytes;
+};
+
+extern struct xfsstats xfsstats;
+
+# define XFS_STATS_INC(count)		( (count)++ )
+# define XFS_STATS_DEC(count)		( (count)-- )
+# define XFS_STATS_ADD(count, inc)	( (count) += (inc) )
+#else	/* !CONFIG_PROC_FS */
+# define XFS_STATS_INC(count)
+# define XFS_STATS_DEC(count)
+# define XFS_STATS_ADD(count, inc)
+#endif	/* !CONFIG_PROC_FS */
+
+
+
+/* juggle IRIX device numbers - still used in ondisk structures */
+
+#ifndef __KERNEL__
+#define MKDEV(major, minor)	makedev(major, minor)
+#endif
+
+#define IRIX_DEV_BITSMAJOR	14
+#define IRIX_DEV_BITSMINOR	18
+#define IRIX_DEV_MAXMAJ		0x1ff
+#define IRIX_DEV_MAXMIN		0x3ffff
+#define IRIX_DEV_MAJOR(dev)	((int)(((unsigned)(dev)>>IRIX_DEV_BITSMINOR) \
+				    & IRIX_DEV_MAXMAJ))
+#define IRIX_DEV_MINOR(dev)	((int)((dev)&IRIX_DEV_MAXMIN))
+#define IRIX_MKDEV(major,minor) ((xfs_dev_t)(((major)<<IRIX_DEV_BITSMINOR) \
+				    | (minor&IRIX_DEV_MAXMIN)))
+
+#define IRIX_DEV_TO_KDEVT(dev)	MKDEV(IRIX_DEV_MAJOR(dev),IRIX_DEV_MINOR(dev))
+
+#endif	/* !__XFS_TYPES_H */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
new file mode 100644
index 000000000000..e87b2a9d04db
--- /dev/null
+++ b/fs/xfs/xfs_utils.c
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+#ifdef CONFIG_PROC_FS
+struct xfsstats xfsstats;
+#endif
+
+/*
+ * xfs_get_dir_entry is used to get a reference to an inode given
+ * its parent directory inode and the name of the file.	 It does
+ * not lock the child inode, and it unlocks the directory before
+ * returning.  The directory's generation number is returned for
+ * use by a later call to xfs_lock_dir_and_entry.
+ */
+int
+xfs_get_dir_entry(
+	struct dentry		*dentry,
+	xfs_inode_t		**ipp)
+{
+	vnode_t			*vp;
+	bhv_desc_t		*bdp;
+
+	ASSERT(dentry->d_inode);
+
+	vp = LINVFS_GET_VP(dentry->d_inode);
+	bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops);
+	if (!bdp) {
+		*ipp = NULL;
+		return XFS_ERROR(ENOENT);
+	}
+	VN_HOLD(vp);
+	*ipp = XFS_BHVTOI(bdp);
+	return 0;
+}
+
+/*
+ * Wrapper around xfs_dir_lookup.
+ *
+ * If DLF_IGET is set, then this routine will also return the inode.
+ * Note that the inode will not be locked. Note, however, that the
+ * vnode will have an additional reference in this case.
+ */
+int
+xfs_dir_lookup_int(
+	bhv_desc_t		*dir_bdp,
+	int			flags,
+	struct dentry		*dentry,
+	xfs_ino_t		*inum,
+	xfs_inode_t		**ipp)
+{
+	vnode_t		*dir_vp;
+	xfs_inode_t	*dp;
+	char		*name = (char *) dentry->d_name.name;
+	int		name_len = dentry->d_name.len;
+	int		error;
+	int		do_iget;
+	uint		lock_mode;
+	bhv_desc_t	*bdp;
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	vn_trace_entry(dir_vp, "xfs_dir_lookup_int",
+		       (inst_t *)__return_address);
+
+	do_iget = flags & DLF_IGET;
+	error = 0;
+
+	if (flags & DLF_LOCK_SHARED) {
+		lock_mode = XFS_ILOCK_SHARED;
+	} else {
+		lock_mode = XFS_ILOCK_EXCL;
+	}
+
+	dp = XFS_BHVTOI(dir_bdp);
+	bdp = NULL;
+
+	/*
+	 * If all else fails, call the directory code.
+	 */
+
+	error = XFS_DIR_LOOKUP(dp->i_mount, NULL, dp, name, name_len, inum);
+	if (!error && do_iget) {
+		/*
+		 * Unlock the directory. We do this because we can't
+		 * hold the directory lock while doing the vn_get()
+		 * in xfs_iget().  Doing so could cause us to hold
+		 * a lock while waiting for the inode to finish
+		 * being inactive while it's waiting for a log
+		 * reservation in the inactive routine.
+		 */
+		xfs_iunlock(dp, lock_mode);
+
+		if (bdp) {
+			VN_RELE(BHV_TO_VNODE(bdp));
+			bdp = NULL;
+		}
+
+		error = xfs_iget(dp->i_mount, NULL, *inum, 0, ipp, 0);
+
+		xfs_ilock(dp, lock_mode);
+
+		if (error) {
+			*ipp = NULL;
+			return error;
+		}
+
+		if ((*ipp)->i_d.di_mode == 0) {
+			/*
+			 * The inode has been freed.  Something is
+			 * wrong so just get out of here.
+			 */
+			xfs_iunlock(dp, lock_mode);
+			xfs_iput_new(*ipp, 0);
+			*ipp = NULL;
+			xfs_ilock(dp, lock_mode);
+			error = XFS_ERROR(ENOENT);
+		} else {
+			bdp = XFS_ITOBHV(*ipp);
+			bdp = NULL;
+		}
+	}
+	if (bdp) {
+		/* The only time we should get here is if the dir_lookup
+		 * failed.
+		 */
+		ASSERT(error);
+		xfs_iunlock(dp, lock_mode);
+		VN_RELE(BHV_TO_VNODE(bdp));
+		xfs_ilock(dp, lock_mode);
+	}
+	return error;
+}
+
+/*
+ * Allocates a new inode from disk and return a pointer to the
+ * incore copy. This routine will internally commit the current
+ * transaction and allocate a new one if the Space Manager needed
+ * to do an allocation to replenish the inode free-list.
+ *
+ * This routine is designed to be called from xfs_create and
+ * xfs_create_dir.
+ *
+ */
+int
+xfs_dir_ialloc(
+	xfs_trans_t	**tpp,		/* input: current transaction;
+					   output: may be a new transaction. */
+	xfs_inode_t	*dp,		/* directory within whose allocate
+					   the inode. */
+	mode_t		mode,
+	nlink_t		nlink,
+	dev_t		rdev,
+	cred_t		*credp,
+	prid_t		prid,		/* project id */
+	int		okalloc,	/* ok to allocate new space */
+	xfs_inode_t	**ipp,		/* pointer to inode; it will be
+					   locked. */
+	int		*committed)
+
+{
+	xfs_trans_t	*tp;
+	xfs_trans_t	*ntp;
+	xfs_inode_t	*ip;
+	xfs_buf_t	*ialloc_context = NULL;
+	boolean_t	call_again = B_FALSE;
+	int		code;
+	uint		log_res;
+	uint		log_count;
+	void		*dqinfo;
+	uint		tflags;
+
+	tp = *tpp;
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+	/*
+	 * xfs_ialloc will return a pointer to an incore inode if
+	 * the Space Manager has an available inode on the free
+	 * list. Otherwise, it will do an allocation and replenish
+	 * the freelist.  Since we can only do one allocation per
+	 * transaction without deadlocks, we will need to commit the
+	 * current transaction and start a new one.  We will then
+	 * need to call xfs_ialloc again to get the inode.
+	 *
+	 * If xfs_ialloc did an allocation to replenish the freelist,
+	 * it returns the bp containing the head of the freelist as
+	 * ialloc_context. We will hold a lock on it across the
+	 * transaction commit so that no other process can steal
+	 * the inode(s) that we've just allocated.
+	 */
+	code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc,
+			  &ialloc_context, &call_again, &ip);
+
+	/*
+	 * Return an error if we were unable to allocate a new inode.
+	 * This should only happen if we run out of space on disk or
+	 * encounter a disk error.
+	 */
+	if (code) {
+		*ipp = NULL;
+		return code;
+	}
+	if (!call_again && (ip == NULL)) {
+		*ipp = NULL;
+		return XFS_ERROR(ENOSPC);
+	}
+
+	/*
+	 * If call_again is set, then we were unable to get an
+	 * inode in one operation.  We need to commit the current
+	 * transaction and call xfs_ialloc() again.  It is guaranteed
+	 * to succeed the second time.
+	 */
+	if (call_again) {
+
+		/*
+		 * Normally, xfs_trans_commit releases all the locks.
+		 * We call bhold to hang on to the ialloc_context across
+		 * the commit.	Holding this buffer prevents any other
+		 * processes from doing any allocations in this
+		 * allocation group.
+		 */
+		xfs_trans_bhold(tp, ialloc_context);
+		/*
+		 * Save the log reservation so we can use
+		 * them in the next transaction.
+		 */
+		log_res = xfs_trans_get_log_res(tp);
+		log_count = xfs_trans_get_log_count(tp);
+
+		/*
+		 * We want the quota changes to be associated with the next
+		 * transaction, NOT this one. So, detach the dqinfo from this
+		 * and attach it to the next transaction.
+		 */
+		dqinfo = NULL;
+		tflags = 0;
+		if (tp->t_dqinfo) {
+			dqinfo = (void *)tp->t_dqinfo;
+			tp->t_dqinfo = NULL;
+			tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
+			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
+		}
+
+		ntp = xfs_trans_dup(tp);
+		code = xfs_trans_commit(tp, 0, NULL);
+		tp = ntp;
+		if (committed != NULL) {
+			*committed = 1;
+		}
+		/*
+		 * If we get an error during the commit processing,
+		 * release the buffer that is still held and return
+		 * to the caller.
+		 */
+		if (code) {
+			xfs_buf_relse(ialloc_context);
+			if (dqinfo) {
+				tp->t_dqinfo = dqinfo;
+				xfs_trans_free_dqinfo(tp);
+			}
+			*tpp = ntp;
+			*ipp = NULL;
+			return code;
+		}
+		code = xfs_trans_reserve(tp, 0, log_res, 0,
+					 XFS_TRANS_PERM_LOG_RES, log_count);
+		/*
+		 * Re-attach the quota info that we detached from prev trx.
+		 */
+		if (dqinfo) {
+			tp->t_dqinfo = dqinfo;
+			tp->t_flags |= tflags;
+		}
+
+		if (code) {
+			xfs_buf_relse(ialloc_context);
+			*tpp = ntp;
+			*ipp = NULL;
+			return code;
+		}
+		xfs_trans_bjoin (tp, ialloc_context);
+
+		/*
+		 * Call ialloc again. Since we've locked out all
+		 * other allocations in this allocation group,
+		 * this call should always succeed.
+		 */
+		code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid,
+				  okalloc, &ialloc_context, &call_again, &ip);
+
+		/*
+		 * If we get an error at this point, return to the caller
+		 * so that the current transaction can be aborted.
+		 */
+		if (code) {
+			*tpp = tp;
+			*ipp = NULL;
+			return code;
+		}
+		ASSERT ((!call_again) && (ip != NULL));
+
+	} else {
+		if (committed != NULL) {
+			*committed = 0;
+		}
+	}
+
+	*ipp = ip;
+	*tpp = tp;
+
+	return 0;
+}
+
+/*
+ * Decrement the link count on an inode & log the change.
+ * If this causes the link count to go to zero, initiate the
+ * logging activity required to truncate a file.
+ */
+int				/* error */
+xfs_droplink(
+	xfs_trans_t *tp,
+	xfs_inode_t *ip)
+{
+	int	error;
+
+	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+
+	ASSERT (ip->i_d.di_nlink > 0);
+	ip->i_d.di_nlink--;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	error = 0;
+	if (ip->i_d.di_nlink == 0) {
+		/*
+		 * We're dropping the last link to this file.
+		 * Move the on-disk inode to the AGI unlinked list.
+		 * From xfs_inactive() we will pull the inode from
+		 * the list and free it.
+		 */
+		error = xfs_iunlink(tp, ip);
+	}
+	return error;
+}
+
+/*
+ * This gets called when the inode's version needs to be changed from 1 to 2.
+ * Currently this happens when the nlink field overflows the old 16-bit value
+ * or when chproj is called to change the project for the first time.
+ * As a side effect the superblock version will also get rev'd
+ * to contain the NLINK bit.
+ */
+void
+xfs_bump_ino_vers2(
+	xfs_trans_t	*tp,
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp;
+	unsigned long		s;
+
+	ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
+	ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1);
+
+	ip->i_d.di_version = XFS_DINODE_VERSION_2;
+	ip->i_d.di_onlink = 0;
+	bzero(&(ip->i_d.di_pad[0]), sizeof(ip->i_d.di_pad));
+	mp = tp->t_mountp;
+	if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+		s = XFS_SB_LOCK(mp);
+		if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
+			XFS_SB_VERSION_ADDNLINK(&mp->m_sb);
+			XFS_SB_UNLOCK(mp, s);
+			xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
+		} else {
+			XFS_SB_UNLOCK(mp, s);
+		}
+	}
+	/* Caller must log the inode */
+}
+
+/*
+ * Increment the link count on an inode & log the change.
+ */
+int
+xfs_bumplink(
+	xfs_trans_t *tp,
+	xfs_inode_t *ip)
+{
+	if (ip->i_d.di_nlink >= XFS_MAXLINK)
+		return XFS_ERROR(EMLINK);
+	xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+
+	ASSERT(ip->i_d.di_nlink > 0);
+	ip->i_d.di_nlink++;
+	if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
+	    (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
+		/*
+		 * The inode has increased its number of links beyond
+		 * what can fit in an old format inode.	 It now needs
+		 * to be converted to a version 2 inode with a 32 bit
+		 * link count.	If this is the first inode in the file
+		 * system to do this, then we need to bump the superblock
+		 * version number as well.
+		 */
+		xfs_bump_ino_vers2(tp, ip);
+	}
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return 0;
+}
+
+/*
+ * Try to truncate the given file to 0 length.	Currently called
+ * only out of xfs_remove when it has to truncate a file to free
+ * up space for the remove to proceed.
+ */
+int
+xfs_truncate_file(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip)
+{
+	xfs_trans_t	*tp;
+	int		error;
+
+#ifdef QUOTADEBUG
+	/*
+	 * This is called to truncate the quotainodes too.
+	 */
+	if (XFS_IS_UQUOTA_ON(mp)) {
+		if (ip->i_ino != mp->m_sb.sb_uquotino)
+			ASSERT(ip->i_udquot);
+	}
+	if (XFS_IS_GQUOTA_ON(mp)) {
+		if (ip->i_ino != mp->m_sb.sb_gquotino)
+			ASSERT(ip->i_gdquot);
+	}
+#endif
+	/*
+	 * Make the call to xfs_itruncate_start before starting the
+	 * transaction, because we cannot make the call while we're
+	 * in a transaction.
+	 */
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
+	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+				      XFS_TRANS_PERM_LOG_RES,
+				      XFS_ITRUNCATE_LOG_COUNT))) {
+		xfs_trans_cancel(tp, 0);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return error;
+	}
+
+	/*
+	 * Follow the normal truncate locking protocol.	 Since we
+	 * hold the inode in the transaction, we know that it's number
+	 * of references will stay constant.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	/*
+	 * Signal a sync xaction.  The only case where that isn't
+	 * the case is if we're truncating an already unlinked file
+	 * on a wsync fs.  In that case, we know the blocks can't
+	 * reappear in the file because the links to file are
+	 * permanently toast.  Currently, we're always going to
+	 * want a sync transaction because this code is being
+	 * called from places where nlink is guaranteed to be 1
+	 * but I'm leaving the tests in to protect against future
+	 * changes -- rcc.
+	 */
+	error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0,
+				     XFS_DATA_FORK,
+				     ((ip->i_d.di_nlink != 0 ||
+				       !(mp->m_flags & XFS_MOUNT_WSYNC))
+				      ? 1 : 0));
+	if (error) {
+		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+				 XFS_TRANS_ABORT);
+	} else {
+		xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
+					 NULL);
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	return error;
+}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
new file mode 100644
index 000000000000..b92d2894cf82
--- /dev/null
+++ b/fs/xfs/xfs_utils.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_UTILS_H__
+#define __XFS_UTILS_H__
+
+#define IRELE(ip)	VN_RELE(XFS_ITOV(ip))
+#define IHOLD(ip)	VN_HOLD(XFS_ITOV(ip))
+#define ITRACE(ip)	vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \
+				(inst_t *)__return_address)
+
+#define DLF_IGET	0x01	/* get entry inode if name lookup succeeds */
+#define DLF_LOCK_SHARED 0x02	/* directory locked shared */
+
+struct bhv_desc;
+struct cred;
+struct vnode;
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_trans;
+
+extern int
+xfs_rename(
+	struct bhv_desc *src_dir_bdp,
+	struct dentry	*src_dentry,
+	struct vnode	*target_dir_vp,
+	struct dentry	*target_dentry,
+	struct cred	*credp);
+
+extern int
+xfs_get_dir_entry(
+	struct dentry		*dentry,
+	xfs_inode_t		**ipp);
+
+extern int
+xfs_dir_lookup_int(
+	struct bhv_desc		*dir_bdp,
+	int			flags,
+	struct dentry		*dentry,
+	xfs_ino_t		*inum,
+	struct xfs_inode	**ipp);
+
+extern int
+xfs_truncate_file(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip);
+
+extern int
+xfs_dir_ialloc(
+	struct xfs_trans	**tpp,
+	struct xfs_inode	*dp,
+	mode_t			mode,
+	nlink_t			nlink,
+	dev_t			rdev,
+	struct cred		*credp,
+	prid_t			prid,
+	int			okalloc,
+	struct xfs_inode	**ipp,
+	int			*committed);
+
+extern int
+xfs_droplink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip);
+
+extern int
+xfs_bumplink(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip);
+
+extern void
+xfs_bump_ino_vers2(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip);
+
+#endif /* XFS_UTILS_H */
+
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
new file mode 100644
index 000000000000..083f5725cd1a
--- /dev/null
+++ b/fs/xfs/xfs_vfsops.c
@@ -0,0 +1,1669 @@
+/*
+ * XFS filesystem operations.
+ *
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+
+STATIC int xfs_ibusy(xfs_mount_t *);
+STATIC int xfs_sync(bhv_desc_t *, int, cred_t *);
+STATIC int xfs_unmount(bhv_desc_t *, int, cred_t *);
+
+int
+xfs_init(void)
+{
+	extern kmem_zone_t	*xfs_da_state_zone;
+	extern kmem_zone_t	*xfs_bmap_free_item_zone;
+	extern kmem_zone_t	*xfs_btree_cur_zone;
+	extern kmem_zone_t	*xfs_inode_zone;
+	extern kmem_zone_t	*xfs_chashlist_zone;
+	extern kmem_zone_t	*xfs_trans_zone;
+	extern kmem_zone_t	*xfs_buf_item_zone;
+	extern kmem_zone_t	*xfs_efd_zone;
+	extern kmem_zone_t	*xfs_efi_zone;
+	extern kmem_zone_t	*xfs_dabuf_zone;
+	extern mutex_t		xfs_uuidtabmon;
+#ifdef DEBUG_NOT
+	extern ktrace_t		*xfs_alloc_trace_buf;
+	extern ktrace_t		*xfs_bmap_trace_buf;
+	extern ktrace_t		*xfs_bmbt_trace_buf;
+	extern ktrace_t		*xfs_dir_trace_buf;
+	extern ktrace_t		*xfs_attr_trace_buf;
+	extern ktrace_t		*xfs_dir2_trace_buf;
+#endif	/* DEBUG */
+#ifdef XFS_DABUF_DEBUG
+	extern lock_t		xfs_dabuf_global_lock;
+#endif
+	extern int		xfs_refcache_size;
+
+#ifdef XFS_DABUF_DEBUG
+	spinlock_init(&xfs_dabuf_global_lock, "xfsda");
+#endif
+	mutex_init(&xfs_uuidtabmon, MUTEX_DEFAULT, "xfs_uuidtab");
+	mutex_init(&xfs_Gqm_lock, MUTEX_DEFAULT, "xfs_qmlock");
+
+	/*
+	 * Initialize all of the zone allocators we use.
+	 */
+	xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
+						 "xfs_bmap_free_item");
+	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
+					    "xfs_btree_cur");
+	xfs_inode_zone = kmem_zone_init(sizeof(xfs_inode_t), "xfs_inode");
+	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+	xfs_da_state_zone =
+		kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
+	xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
+
+	/*
+	 * The size of the zone allocated buf log item is the maximum
+	 * size possible under XFS.  This wastes a little bit of memory,
+	 * but it is much faster.
+	 */
+	xfs_buf_item_zone =
+		kmem_zone_init((sizeof(xfs_buf_log_item_t) +
+				(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+				  NBWORD) * sizeof(int))),
+			       "xfs_buf_item");
+	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+				       ((XFS_EFD_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
+				      "xfs_efd_item");
+	xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+				       ((XFS_EFI_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
+				      "xfs_efi_item");
+	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+	xfs_ili_zone = kmem_zone_init(sizeof(xfs_inode_log_item_t), "xfs_ili");
+	xfs_chashlist_zone = kmem_zone_init(sizeof(xfs_chashlist_t),
+					    "xfs_chashlist");
+	_ACL_ZONE_INIT(xfs_acl_zone, "xfs_acl");
+
+#ifdef CONFIG_XFS_VNODE_TRACING
+	ktrace_init(VNODE_TRACE_SIZE);
+#else
+#ifdef DEBUG
+	ktrace_init(64);
+#endif
+#endif
+
+	/*
+	 * Allocate global trace buffers.
+	 */
+#ifdef XFS_ALLOC_TRACE
+	xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_BMAP_TRACE
+	xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_BMBT_TRACE
+	xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_DIR_TRACE
+	xfs_dir_trace_buf = ktrace_alloc(XFS_DIR_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_ATTR_TRACE
+	xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP);
+#endif
+#ifdef XFS_DIR2_TRACE
+	xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP);
+#endif
+
+	xfs_dir_startup();
+
+#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+	xfs_error_test_init();
+#endif /* DEBUG || INDUCE_IO_ERROR */
+
+	xfs_init_procfs();
+	xfs_sysctl_register();
+
+	xfs_refcache_size = xfs_params.refcache_size;
+
+	/*
+	 * The inode hash table is created on a per mounted
+	 * file system bases.
+	 */
+
+	return 0;
+}
+
+void
+xfs_cleanup(void)
+{
+	extern kmem_zone_t	*xfs_bmap_free_item_zone;
+	extern kmem_zone_t	*xfs_btree_cur_zone;
+	extern kmem_zone_t	*xfs_inode_zone;
+	extern kmem_zone_t	*xfs_trans_zone;
+	extern kmem_zone_t	*xfs_da_state_zone;
+	extern kmem_zone_t	*xfs_dabuf_zone;
+	extern kmem_zone_t	*xfs_efd_zone;
+	extern kmem_zone_t	*xfs_efi_zone;
+	extern kmem_zone_t	*xfs_buf_item_zone;
+	extern kmem_zone_t	*xfs_chashlist_zone;
+	extern xfs_inode_t	**xfs_refcache;
+
+	xfs_cleanup_procfs();
+	xfs_sysctl_unregister();
+	if (xfs_refcache) {
+		kmem_free(xfs_refcache,
+			XFS_REFCACHE_SIZE_MAX * sizeof(xfs_inode_t *));
+	}
+
+	kmem_cache_destroy(xfs_bmap_free_item_zone);
+	kmem_cache_destroy(xfs_btree_cur_zone);
+	kmem_cache_destroy(xfs_inode_zone);
+	kmem_cache_destroy(xfs_trans_zone);
+	kmem_cache_destroy(xfs_da_state_zone);
+	kmem_cache_destroy(xfs_dabuf_zone);
+	kmem_cache_destroy(xfs_buf_item_zone);
+	kmem_cache_destroy(xfs_efd_zone);
+	kmem_cache_destroy(xfs_efi_zone);
+	kmem_cache_destroy(xfs_ifork_zone);
+	kmem_cache_destroy(xfs_ili_zone);
+	kmem_cache_destroy(xfs_chashlist_zone);
+	_XQM_ZONE_DESTROY(qm_dqzone);
+	_XQM_ZONE_DESTROY(qm_dqtrxzone);
+	_ACL_ZONE_DESTROY(xfs_acl_zone);
+#if  (defined(DEBUG) || defined(CONFIG_XFS_VNODE_TRACING))
+	ktrace_uninit();
+#endif
+}
+
+/*
+ * xfs_cmountfs
+ *
+ * This function is the common mount file system function for XFS.
+ */
+STATIC int
+xfs_cmountfs(
+	vfs_t		*vfsp,
+	dev_t		ddev,
+	dev_t		logdev,
+	dev_t		rtdev,
+	struct xfs_mount_args *ap,
+	struct cred	*cr)
+{
+	xfs_mount_t	*mp;
+	int		error = 0;
+
+
+	/*
+	 * Allocate VFS private data (xfs mount structure).
+	 */
+	mp = xfs_mount_init();
+
+	vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp);
+
+	/*
+	 * Open data, real time, and log devices now - order is important.
+	 */
+	mp->m_ddev_targp = pagebuf_lock_enable(ddev, 0);
+	if (IS_ERR(mp->m_ddev_targp)) {
+		error = PTR_ERR(mp->m_ddev_targp);
+		goto error2;
+	}
+
+	if (rtdev != 0) {
+		mp->m_rtdev_targp = 
+				pagebuf_lock_enable(rtdev, 1);
+		if (IS_ERR(mp->m_rtdev_targp)) {
+			error = PTR_ERR(mp->m_rtdev_targp);
+			pagebuf_lock_disable(mp->m_ddev_targp, 0);
+			goto error2;
+		}
+	}
+
+	if (logdev != ddev) {
+		mp->m_logdev_targp = 
+				pagebuf_lock_enable(logdev, 1);
+		if (IS_ERR(mp->m_logdev_targp)) {
+			error = PTR_ERR(mp->m_logdev_targp);
+			pagebuf_lock_disable(mp->m_ddev_targp, 1);
+			if (mp->m_rtdev_targp)
+				pagebuf_lock_disable(mp->m_rtdev_targp, 1);
+			goto error2;
+		}
+	}
+
+	/* Values are in BBs */
+	if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+		/*
+		 * At this point the superblock has not been read
+		 * in, therefore we do not know the block size.
+		 * Before, the mount call ends we will convert
+		 * these to FSBs.
+		 */
+		mp->m_dalign = ap->sunit;
+		mp->m_swidth = ap->swidth;
+	} else {
+		mp->m_dalign = 0;
+		mp->m_swidth = 0;
+	}
+
+	if (logdev != 0) {
+		if (logdev == ddev) {
+			mp->m_logdev_targp = mp->m_ddev_targp;
+		} else {
+			/* Set the log device's block size */
+			set_blocksize(mp->m_logdev_targp->pbr_bdev, 512);
+		}
+
+		if (ap->logbufs != 0 && ap->logbufs != -1 &&
+		    (ap->logbufs < XLOG_NUM_ICLOGS ||
+		     ap->logbufs > XLOG_MAX_ICLOGS)) {
+			cmn_err(CE_WARN, 
+				"XFS: invalid logbufs value: %d [not %d-%d]\n",
+				ap->logbufs, XLOG_NUM_ICLOGS, XLOG_MAX_ICLOGS);
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		}
+		mp->m_logbufs = ap->logbufs;
+		if (ap->logbufsize != -1 &&
+		    ap->logbufsize != 16 * 1024 &&
+		    ap->logbufsize != 32 * 1024 &&
+		    ap->logbufsize != 64 * 1024 &&
+		    ap->logbufsize != 128 * 1024 &&
+		    ap->logbufsize != 256 * 1024) {
+			cmn_err(CE_WARN,
+		"XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]\n",
+				ap->logbufsize);
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		}
+		mp->m_logbsize = ap->logbufsize;
+		mp->m_fsname_len = strlen(ap->fsname) + 1;
+		mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
+		strcpy(mp->m_fsname, ap->fsname);
+	}
+	if (rtdev != 0) {
+		if (rtdev == ddev || rtdev == logdev) {
+			cmn_err(CE_WARN,
+	"XFS: Cannot mount filesystem with identical rtdev and logdev.");
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		} else {
+			/* Set the realtime device's block size */
+			set_blocksize(mp->m_rtdev_targp->pbr_bdev, 512);
+		}
+	}
+
+	/*
+	 * Pull in the 'wsync' and 'ino64' mount options before we do the real
+	 * work of mounting and recovery.  The arg pointer will
+	 * be NULL when we are being called from the root mount code.
+	 */
+#if XFS_BIG_FILESYSTEMS
+	mp->m_inoadd = 0;
+#endif
+	if (ap != NULL) {
+		if (ap->flags & XFSMNT_WSYNC)
+			mp->m_flags |= XFS_MOUNT_WSYNC;
+#if XFS_BIG_FILESYSTEMS
+		if (ap->flags & XFSMNT_INO64) {
+			mp->m_flags |= XFS_MOUNT_INO64;
+			mp->m_inoadd = XFS_INO64_OFFSET;
+		}
+#endif
+		if (ap->flags & XFSMNT_NOATIME)
+			mp->m_flags |= XFS_MOUNT_NOATIME;
+
+		if (ap->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA))
+			xfs_qm_mount_quotainit(mp, ap->flags);
+
+		if (ap->flags & XFSMNT_RETERR)
+			mp->m_flags |= XFS_MOUNT_RETERR;
+
+		if (ap->flags & XFSMNT_NOALIGN)
+			mp->m_flags |= XFS_MOUNT_NOALIGN;
+
+		if (ap->flags & XFSMNT_OSYNCISOSYNC)
+			mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
+
+		/* Default on Linux */
+		if ( 1 || ap->flags & XFSMNT_32BITINODES)
+			mp->m_flags |= XFS_MOUNT_32BITINODES;
+
+		if (ap->flags & XFSMNT_IRIXSGID)
+			mp->m_flags |= XFS_MOUNT_IRIXSGID;
+
+		if (ap->flags & XFSMNT_IOSIZE) {
+			if (ap->iosizelog > XFS_MAX_IO_LOG ||
+			    ap->iosizelog < XFS_MIN_IO_LOG) {
+				cmn_err(CE_WARN,
+			"XFS: invalid log iosize: %d [not %d-%d]",
+					ap->iosizelog, XFS_MIN_IO_LOG,
+					XFS_MAX_IO_LOG);
+				error = XFS_ERROR(EINVAL);
+				goto error3;
+			}
+
+			mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+			mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
+		}
+
+		/*
+		 * no recovery flag requires a read-only mount
+		 */
+		if (ap->flags & XFSMNT_NORECOVERY) {
+			if (!(vfsp->vfs_flag & VFS_RDONLY)) {
+				cmn_err(CE_WARN,
+		"XFS: tried to mount a FS read-write without recovery!");
+				error = XFS_ERROR(EINVAL);
+				goto error3;
+			}
+			mp->m_flags |= XFS_MOUNT_NORECOVERY;
+		}
+
+		if (ap->flags & XFSMNT_NOUUID)
+			mp->m_flags |= XFS_MOUNT_NOUUID;
+		if (ap->flags & XFSMNT_NOLOGFLUSH)
+			mp->m_flags |= XFS_MOUNT_NOLOGFLUSH;
+	}
+
+	/*
+	 * read in superblock to check read-only flags and shared
+	 * mount status
+	 */
+	if ((error = xfs_readsb(mp)))
+		goto error3;
+
+	/* Fail a mount where the logbuf is smaller then the log stripe */
+	if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) {
+		if (((ap->logbufsize == -1) &&
+		     (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) ||
+		    (ap->logbufsize < mp->m_sb.sb_logsunit)) {
+			cmn_err(CE_WARN, "XFS: "
+				"logbuf size must be greater than or equal to log stripe size");
+			xfs_freesb(mp);
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		}
+	} else {
+		/* Fail a mount if the logbuf is larger than 32K */
+		if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
+			cmn_err(CE_WARN, "XFS: "
+		"XFS: logbuf size for version 1 logs must be 16K or 32K");
+			xfs_freesb(mp);
+			error = XFS_ERROR(EINVAL);
+			goto error3;
+		}
+	}
+
+	pagebuf_target_blocksize(mp->m_ddev_targp, mp->m_sb.sb_blocksize);
+	if (logdev != 0 && logdev != ddev)
+		pagebuf_target_blocksize(mp->m_logdev_targp, mp->m_sb.sb_blocksize);
+	if (rtdev != 0)
+		pagebuf_target_blocksize(mp->m_rtdev_targp, mp->m_sb.sb_blocksize);
+
+	/*
+	 * prohibit r/w mounts of read-only filesystems
+	 */
+	if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) &&
+	    !(vfsp->vfs_flag & VFS_RDONLY)) {
+		cmn_err(CE_WARN, "XFS: "
+			"cannot mount a read-only filesystem as read-write");
+		error = XFS_ERROR(EROFS);
+		xfs_freesb(mp);
+		goto error3;
+	}
+
+	/*
+	 * check for shared mount.
+	 */
+	if (ap && ap->flags & XFSMNT_SHARED) {
+		if (!XFS_SB_VERSION_HASSHARED(&mp->m_sb)) {
+			error = XFS_ERROR(EINVAL);
+			xfs_freesb(mp);
+			goto error3;
+		}
+
+		/*
+		 * For IRIX 6.5, shared mounts must have the shared
+		 * version bit set, have the persistent readonly
+		 * field set, must be version 0 and can only be mounted
+		 * read-only.
+		 */
+		if (!(vfsp->vfs_flag & VFS_RDONLY) ||
+		    !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
+		    mp->m_sb.sb_shared_vn != 0) {
+			error = XFS_ERROR(EINVAL);
+			xfs_freesb(mp);
+			goto error3;
+		}
+
+		mp->m_flags |= XFS_MOUNT_SHARED;
+
+		/*
+		 * Shared XFS V0 can't deal with DMI.  Return EINVAL.
+		 */
+		if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) {
+			error = XFS_ERROR(EINVAL);
+			xfs_freesb(mp);
+			goto error3;
+		}
+	}
+
+	if ((error = xfs_mountfs(vfsp, mp, ddev, 0)) == 0)
+		return 0;
+
+	/*
+	 * Be careful not to clobber the value of 'error' here.
+	 */
+ error3:
+	/* It's impossible to get here before buftargs are filled */
+	xfs_binval(mp->m_ddev_targp);
+	pagebuf_lock_disable(mp->m_ddev_targp, 0);
+	if (logdev && logdev != ddev) {
+		xfs_binval(mp->m_logdev_targp);
+		pagebuf_lock_disable(mp->m_logdev_targp, 1);
+	}
+	if (rtdev != 0) {
+		xfs_binval(mp->m_rtdev_targp);
+		pagebuf_lock_disable(mp->m_rtdev_targp, 1);
+	}
+ error2:
+	if (error) {
+		xfs_mount_free(mp, 1);
+	}
+	return error;
+}
+
+/*
+ * xfs_mount
+ *
+ * The file system configurations are:
+ *	(1) device (partition) with data and internal log
+ *	(2) logical volume with data and log subvolumes.
+ *	(3) logical volume with data, log, and realtime subvolumes.
+ */
+STATIC int
+xfs_mount(
+	vfs_t			*vfsp,
+	struct xfs_mount_args	*args,
+	cred_t			*credp)
+{
+	dev_t		ddev;
+	dev_t		logdev;
+	dev_t		rtdev;
+	int		error;
+
+	error = spectodevs(vfsp->vfs_super, args, &ddev, &logdev, &rtdev);
+	if (!error)
+		error = xfs_cmountfs(vfsp, ddev, logdev, rtdev, args, credp);
+	return (error);
+}
+
+/*
+ * xfs_ibusy searches for a busy inode in the mounted file system.
+ *
+ * Return 0 if there are no active inodes otherwise return 1.
+ */
+STATIC int
+xfs_ibusy(
+	xfs_mount_t	*mp)
+{
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+	int		busy;
+
+	busy = 0;
+
+	XFS_MOUNT_ILOCK(mp);
+
+	ip = mp->m_inodes;
+	if (ip == NULL) {
+		XFS_MOUNT_IUNLOCK(mp);
+		return busy;
+	}
+
+	do {
+		/* Skip markers inserted by xfs_sync */
+		if (ip->i_mount == NULL) {
+			ip = ip->i_mnext;
+			continue;
+		}
+
+		vp = XFS_ITOV_NULL(ip);
+
+		if (vp && vn_count(vp) != 0) {
+			if (xfs_ibusy_check(ip, vn_count(vp)) == 0) {
+				ip = ip->i_mnext;
+				continue;
+			}
+#ifdef DEBUG
+			printk("busy vp=0x%p ip=0x%p inum %Ld count=%d\n",
+				vp, ip, ip->i_ino, vn_count(vp));
+#endif
+			busy++;
+		}
+		ip = ip->i_mnext;
+	} while ((ip != mp->m_inodes) && !busy);
+
+	XFS_MOUNT_IUNLOCK(mp);
+
+	return busy;
+}
+
+
+STATIC int
+xfs_unmount(
+	bhv_desc_t	*bdp,
+	int		flags,
+	cred_t		*credp)
+{
+	xfs_mount_t	*mp;
+	xfs_inode_t	*rip;
+	vnode_t		*rvp = 0;
+	struct vfs	*vfsp = bhvtovfs(bdp);
+	int		unmount_event_wanted = 0;
+	int		unmount_event_flags = 0;
+	int		xfs_unmountfs_needed = 0;
+	int		error;
+
+	mp = XFS_BHVTOM(bdp);
+	rip = mp->m_rootip;
+	rvp = XFS_ITOV(rip);
+
+	if (vfsp->vfs_flag & VFS_DMI) {
+		bhv_desc_t	*rbdp;
+
+		rbdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(rvp), &xfs_vnodeops);
+		error = dm_send_namesp_event(DM_EVENT_PREUNMOUNT,
+				rbdp, DM_RIGHT_NULL, rbdp, DM_RIGHT_NULL,
+				NULL, NULL, 0, 0,
+				(mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
+					0:DM_FLAGS_UNWANTED);
+			if (error)
+				return XFS_ERROR(error);
+		unmount_event_wanted = 1;
+		unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))?
+					0 : DM_FLAGS_UNWANTED;
+	}
+
+	/*
+	 * First blow any referenced inode from this file system
+	 * out of the reference cache, and delete the timer.
+	 */
+	xfs_refcache_purge_mp(mp);
+	del_timer_sync(&mp->m_sbdirty_timer);
+
+	/*
+	 * Make sure there are no active users.
+	 */
+	if (xfs_ibusy(mp)) {
+		error = XFS_ERROR(EBUSY);
+		printk("xfs_unmount: xfs_ibusy says error/%d\n", error);
+		goto out;
+	}
+
+	XFS_bflush(mp->m_ddev_targp);
+	error = xfs_unmount_flush(mp, 0);
+	if (error)
+		goto out;
+
+	ASSERT(vn_count(rvp) == 1);
+
+	/*
+	 * Drop the reference count, and then
+	 * run the vnode through vn_remove.
+	 */
+	rvp->v_flag |= VPURGE;			/* OK for vn_purge */
+	VN_RELE(rvp);
+
+	vn_remove(rvp);
+
+	/*
+	 * If we're forcing a shutdown, typically because of a media error,
+	 * we want to make sure we invalidate dirty pages that belong to
+	 * referenced vnodes as well.
+	 */
+	if (XFS_FORCED_SHUTDOWN(mp))  {
+		error = xfs_sync(&mp->m_bhv,
+			 (SYNC_WAIT | SYNC_CLOSE), credp);
+		ASSERT(error != EFSCORRUPTED);
+	}
+	xfs_unmountfs_needed = 1;
+
+out:
+	/*	Send DMAPI event, if required.
+	 *	Then do xfs_unmountfs() if needed.
+	 *	Then return error (or zero).
+	 */
+	if (unmount_event_wanted) {
+		/* Note: mp structure must still exist for
+		 * dm_send_unmount_event() call.
+		 */
+		dm_send_unmount_event(vfsp, error == 0 ? rvp : NULL,
+			DM_RIGHT_NULL, 0, error, unmount_event_flags);
+	}
+	if (xfs_unmountfs_needed) {
+		/*
+		 * Call common unmount function to flush to disk
+		 * and free the super block buffer & mount structures.
+		 */
+		xfs_unmountfs(mp, credp);
+	}
+
+	return XFS_ERROR(error);
+}
+
+/*
+ * xfs_unmount_flush implements a set of flush operation on special
+ * inodes, which are needed as a separate set of operations so that
+ * they can be called as part of relocation process.
+ */
+int
+xfs_unmount_flush(
+	xfs_mount_t	*mp,		/* Mount structure we are getting
+					   rid of. */
+	int		relocation)	/* Called from vfs relocation. */
+{
+	xfs_inode_t	*rip = mp->m_rootip;
+	xfs_inode_t	*rbmip;
+	xfs_inode_t	*rsumip=NULL;
+	vnode_t		*rvp = XFS_ITOV(rip);
+	int		error;
+
+	xfs_ilock(rip, XFS_ILOCK_EXCL);
+	xfs_iflock(rip);
+
+	/*
+	 * Flush out the real time inodes.
+	 */
+	if ((rbmip = mp->m_rbmip) != NULL) {
+		xfs_ilock(rbmip, XFS_ILOCK_EXCL);
+		xfs_iflock(rbmip);
+		error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
+		xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
+
+		if (error == EFSCORRUPTED)
+			goto fscorrupt_out;
+
+		ASSERT(vn_count(XFS_ITOV(rbmip)) == 1);
+
+		rsumip = mp->m_rsumip;
+		xfs_ilock(rsumip, XFS_ILOCK_EXCL);
+		xfs_iflock(rsumip);
+		error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
+		xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
+
+		if (error == EFSCORRUPTED)
+			goto fscorrupt_out;
+
+		ASSERT(vn_count(XFS_ITOV(rsumip)) == 1);
+	}
+
+	/*
+	 * synchronously flush root inode to disk
+	 */
+	error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
+
+	if (error == EFSCORRUPTED)
+		goto fscorrupt_out2;
+
+	if (vn_count(rvp) != 1 && !relocation) {
+		xfs_iunlock(rip, XFS_ILOCK_EXCL);
+		error = XFS_ERROR(EBUSY);
+		return (error);
+	}
+	/*
+	 * Release dquot that rootinode, rbmino and rsumino might be holding,
+	 * flush and purge the quota inodes.
+	 */
+	error = xfs_qm_unmount_quotas(mp);
+	if (error == EFSCORRUPTED)
+		goto fscorrupt_out2;
+
+	if (rbmip) {
+		VN_RELE(XFS_ITOV(rbmip));
+		VN_RELE(XFS_ITOV(rsumip));
+	}
+
+	xfs_iunlock(rip, XFS_ILOCK_EXCL);
+	return (0);
+
+fscorrupt_out:
+	xfs_ifunlock(rip);
+
+fscorrupt_out2:
+	xfs_iunlock(rip, XFS_ILOCK_EXCL);
+
+	error = XFS_ERROR(EFSCORRUPTED);
+	return (error);
+}
+
+/*
+ * xfs_root extracts the root vnode from a vfs.
+ *
+ * vfsp -- the vfs struct for the desired file system
+ * vpp	-- address of the caller's vnode pointer which should be
+ *	   set to the desired fs root vnode
+ */
+STATIC int
+xfs_root(
+	bhv_desc_t	*bdp,
+	vnode_t		**vpp)
+{
+	vnode_t *vp;
+
+	vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip);
+	VN_HOLD(vp);
+	*vpp = vp;
+
+	return 0;
+}
+
+/*
+ * xfs_statvfs
+ *
+ * Fill in the statvfs structure for the given file system.  We use
+ * the superblock lock in the mount structure to ensure a consistent
+ * snapshot of the counters returned.
+ */
+STATIC int
+xfs_statvfs(
+	bhv_desc_t	*bdp,
+	struct statfs	*statp,
+	vnode_t		*vp)
+{
+	__uint64_t	fakeinos;
+	xfs_extlen_t	lsize;
+	xfs_mount_t	*mp;
+	xfs_sb_t	*sbp;
+	unsigned long	s;
+
+	mp = XFS_BHVTOM(bdp);
+	sbp = &(mp->m_sb);
+
+	statp->f_type = XFS_SB_MAGIC;
+
+	s = XFS_SB_LOCK(mp);
+	statp->f_bsize = sbp->sb_blocksize;
+	lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
+	statp->f_blocks = sbp->sb_dblocks - lsize;
+	statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks;
+	fakeinos = statp->f_bfree << sbp->sb_inopblog;
+#if XFS_BIG_FILESYSTEMS
+	fakeinos += mp->m_inoadd;
+#endif
+	statp->f_files =
+	    MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
+	if (mp->m_maxicount)
+#if XFS_BIG_FILESYSTEMS
+		if (!mp->m_inoadd)
+#endif
+			statp->f_files =
+			    MIN(statp->f_files, (long)mp->m_maxicount);
+	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+	XFS_SB_UNLOCK(mp, s);
+
+	statp->f_fsid.val[0] = mp->m_dev;
+	statp->f_fsid.val[1] = 0;
+	statp->f_namelen = MAXNAMELEN - 1;
+
+	return 0;
+}
+
+
+/*
+ * xfs_sync flushes any pending I/O to file system vfsp.
+ *
+ * This routine is called by vfs_sync() to make sure that things make it
+ * out to disk eventually, on sync() system calls to flush out everything,
+ * and when the file system is unmounted.  For the vfs_sync() case, all
+ * we really need to do is sync out the log to make all of our meta-data
+ * updates permanent (except for timestamps).  For calls from pflushd(),
+ * dirty pages are kept moving by calling pdflush() on the inodes
+ * containing them.  We also flush the inodes that we can lock without
+ * sleeping and the superblock if we can lock it without sleeping from
+ * vfs_sync() so that items at the tail of the log are always moving out.
+ *
+ * Flags:
+ *	SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
+ *		       to sleep if we can help it.  All we really need
+ *		       to do is ensure that the log is synced at least
+ *		       periodically.  We also push the inodes and
+ *		       superblock if we can lock them without sleeping
+ *			and they are not pinned.
+ *	SYNC_ATTR    - We need to flush the inodes.  If SYNC_BDFLUSH is not
+ *		       set, then we really want to lock each inode and flush
+ *		       it.
+ *	SYNC_WAIT    - All the flushes that take place in this call should
+ *		       be synchronous.
+ *	SYNC_DELWRI  - This tells us to push dirty pages associated with
+ *		       inodes.	SYNC_WAIT and SYNC_BDFLUSH are used to
+ *		       determine if they should be flushed sync, async, or
+ *		       delwri.
+ *	SYNC_CLOSE   - This flag is passed when the system is being
+ *		       unmounted.  We should sync and invalidate everthing.
+ *	SYNC_FSDATA  - This indicates that the caller would like to make
+ *		       sure the superblock is safe on disk.  We can ensure
+ *		       this by simply makeing sure the log gets flushed
+ *		       if SYNC_BDFLUSH is set, and by actually writing it
+ *		       out otherwise.
+ *
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_sync(
+	bhv_desc_t	*bdp,
+	int		flags,
+	cred_t		*credp)
+{
+	xfs_mount_t	*mp;
+
+	mp = XFS_BHVTOM(bdp);
+	return (xfs_syncsub(mp, flags, 0, NULL));
+}
+
+/*
+ * xfs sync routine for internal use
+ *
+ * This routine supports all of the flags defined for the generic VFS_SYNC
+ * interface as explained above under xys_sync.	 In the interests of not
+ * changing interfaces within the 6.5 family, additional internallly-
+ * required functions are specified within a separate xflags parameter,
+ * only available by calling this routine.
+ *
+ * xflags:
+ *	XFS_XSYNC_RELOC - Sync for relocation.	Don't try to get behavior
+ *			  locks as this will cause you to hang.	 Not all
+ *			  combinations of flags are necessarily supported
+ *			  when this is specified.
+ */
+int
+xfs_syncsub(
+	xfs_mount_t	*mp,
+	int		flags,
+	int		xflags,
+	int		*bypassed)
+{
+	xfs_inode_t	*ip = NULL;
+	xfs_inode_t	*ip_next;
+	xfs_buf_t	*bp;
+	vnode_t		*vp = NULL;
+	vmap_t		vmap;
+	int		error;
+	int		last_error;
+	uint64_t	fflag;
+	uint		lock_flags;
+	uint		base_lock_flags;
+	uint		log_flags;
+	boolean_t	mount_locked;
+	boolean_t	vnode_refed;
+	int		preempt;
+	int		do_mmap_flush;
+	xfs_dinode_t	*dip;
+	xfs_buf_log_item_t	*bip;
+	xfs_iptr_t	*ipointer;
+#ifdef DEBUG
+	boolean_t	ipointer_in = B_FALSE;
+
+#define IPOINTER_SET	ipointer_in = B_TRUE
+#define IPOINTER_CLR	ipointer_in = B_FALSE
+#else
+#define IPOINTER_SET
+#define IPOINTER_CLR
+#endif
+
+
+/* Insert a marker record into the inode list after inode ip. The list
+ * must be locked when this is called. After the call the list will no
+ * longer be locked.
+ */
+#define IPOINTER_INSERT(ip, mp) { \
+		ASSERT(ipointer_in == B_FALSE); \
+		ipointer->ip_mnext = ip->i_mnext; \
+		ipointer->ip_mprev = ip; \
+		ip->i_mnext = (xfs_inode_t *)ipointer; \
+		ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
+		preempt = 0; \
+		XFS_MOUNT_IUNLOCK(mp); \
+		mount_locked = B_FALSE; \
+		IPOINTER_SET; \
+	}
+
+/* Remove the marker from the inode list. If the marker was the only item
+ * in the list then there are no remaining inodes and we should zero out
+ * the whole list. If we are the current head of the list then move the head
+ * past us.
+ */
+#define IPOINTER_REMOVE(ip, mp) { \
+		ASSERT(ipointer_in == B_TRUE); \
+		if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
+			ip = ipointer->ip_mnext; \
+			ip->i_mprev = ipointer->ip_mprev; \
+			ipointer->ip_mprev->i_mnext = ip; \
+			if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
+				mp->m_inodes = ip; \
+			} \
+		} else { \
+			ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
+			mp->m_inodes = NULL; \
+			ip = NULL; \
+		} \
+		IPOINTER_CLR; \
+	}
+
+#define PREEMPT_MASK	0x7f
+
+	if (bypassed)
+		*bypassed = 0;
+	if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+		return 0;
+	error = 0;
+	last_error = 0;
+	preempt = 0;
+
+	/* Allocate a reference marker */
+	ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
+
+	fflag = XFS_B_ASYNC;		/* default is don't wait */
+	if (flags & SYNC_BDFLUSH)
+		fflag = XFS_B_DELWRI;
+	if (flags & SYNC_WAIT)
+		fflag = 0;		/* synchronous overrides all */
+	do_mmap_flush = (flags & (SYNC_DELWRI|SYNC_BDFLUSH)) !=
+						(SYNC_DELWRI|SYNC_BDFLUSH);
+
+	base_lock_flags = XFS_ILOCK_SHARED;
+	if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
+		/*
+		 * We need the I/O lock if we're going to call any of
+		 * the flush/inval routines.
+		 */
+		base_lock_flags |= XFS_IOLOCK_SHARED;
+	}
+
+	/*
+	 * Sync out the log.  This ensures that the log is periodically
+	 * flushed even if there is not enough activity to fill it up.
+	 */
+	if (flags & SYNC_WAIT) {
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	} else {
+		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+	}
+
+	XFS_MOUNT_ILOCK(mp);
+
+	ip = mp->m_inodes;
+
+	mount_locked = B_TRUE;
+	vnode_refed  = B_FALSE;
+
+	IPOINTER_CLR;
+
+	do {
+		ASSERT(ipointer_in == B_FALSE);
+		ASSERT(vnode_refed == B_FALSE);
+
+		lock_flags = base_lock_flags;
+
+		/*
+		 * There were no inodes in the list, just break out
+		 * of the loop.
+		 */
+		if (ip == NULL) {
+			break;
+		}
+
+		/*
+		 * We found another sync thread marker - skip it
+		 */
+		if (ip->i_mount == NULL) {
+			ip = ip->i_mnext;
+			continue;
+		}
+
+		vp = XFS_ITOV_NULL(ip);
+
+		/*
+		 * If the vnode is gone then this is being torn down,
+		 * call reclaim if it is flushed, else let regular flush
+		 * code deal with it later in the loop.
+		 */
+
+		if (vp == NULL) {
+			/* Skip ones already in reclaim */
+			if (ip->i_flags & XFS_IRECLAIM) {
+				ip = ip->i_mnext;
+				continue;
+			}
+			if ((ip->i_update_core == 0) &&
+			    ((ip->i_itemp == NULL) ||
+			    !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL))) {
+				if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
+					ip = ip->i_mnext;
+				} else if ((xfs_ipincount(ip) == 0) &&
+				    xfs_iflock_nowait(ip)) {
+					IPOINTER_INSERT(ip, mp);
+
+					xfs_finish_reclaim(ip, 1,
+						XFS_IFLUSH_DELWRI_ELSE_SYNC);
+
+					XFS_MOUNT_ILOCK(mp);
+					mount_locked = B_TRUE;
+					IPOINTER_REMOVE(ip, mp);
+				} else {
+					xfs_iunlock(ip, XFS_ILOCK_EXCL);
+					ip = ip->i_mnext;
+				}
+				continue;
+			}
+		}
+
+		if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
+			XFS_MOUNT_IUNLOCK(mp);
+			kmem_free(ipointer, sizeof(xfs_iptr_t));
+			return 0;
+		}
+
+		/*
+		 * If this is just vfs_sync() or pflushd() calling
+		 * then we can skip inodes for which it looks like
+		 * there is nothing to do.  Since we don't have the
+		 * inode locked this is racey, but these are periodic
+		 * calls so it doesn't matter.	For the others we want
+		 * to know for sure, so we at least try to lock them.
+		 */
+		if (flags & SYNC_BDFLUSH) {
+			if (((ip->i_itemp == NULL) ||
+			     !(ip->i_itemp->ili_format.ilf_fields &
+			       XFS_ILOG_ALL)) &&
+			    (ip->i_update_core == 0)) {
+				ip = ip->i_mnext;
+				continue;
+			}
+		}
+
+		/*
+		 * Try to lock without sleeping.  We're out of order with
+		 * the inode list lock here, so if we fail we need to drop
+		 * the mount lock and try again.  If we're called from
+		 * bdflush() here, then don't bother.
+		 *
+		 * The inode lock here actually coordinates with the
+		 * almost spurious inode lock in xfs_ireclaim() to prevent
+		 * the vnode we handle here without a reference from
+		 * being freed while we reference it.  If we lock the inode
+		 * while it's on the mount list here, then the spurious inode
+		 * lock in xfs_ireclaim() after the inode is pulled from
+		 * the mount list will sleep until we release it here.
+		 * This keeps the vnode from being freed while we reference
+		 * it.	It is also cheaper and simpler than actually doing
+		 * a vn_get() for every inode we touch here.
+		 */
+		if (xfs_ilock_nowait(ip, lock_flags) == 0) {
+
+			if ((flags & SYNC_BDFLUSH) || (vp == NULL)) {
+				ip = ip->i_mnext;
+				continue;
+			}
+
+			/*
+			 * We need to unlock the inode list lock in order
+			 * to lock the inode. Insert a marker record into
+			 * the inode list to remember our position, dropping
+			 * the lock is now done inside the IPOINTER_INSERT
+			 * macro.
+			 *
+			 * We also use the inode list lock to protect us
+			 * in taking a snapshot of the vnode version number
+			 * for use in calling vn_get().
+			 */
+			VMAP(vp, ip, vmap);
+			IPOINTER_INSERT(ip, mp);
+
+			vp = vn_get(vp, &vmap);
+			if (vp == NULL) {
+				/*
+				 * The vnode was reclaimed once we let go
+				 * of the inode list lock.  Skip to the
+				 * next list entry. Remove the marker.
+				 */
+
+				XFS_MOUNT_ILOCK(mp);
+
+				mount_locked = B_TRUE;
+				vnode_refed  = B_FALSE;
+
+				IPOINTER_REMOVE(ip, mp);
+
+				continue;
+			}
+
+			xfs_ilock(ip, lock_flags);
+
+			ASSERT(vp == XFS_ITOV(ip));
+			ASSERT(ip->i_mount == mp);
+
+			vnode_refed = B_TRUE;
+		}
+
+		/* From here on in the loop we may have a marker record
+		 * in the inode list.
+		 */
+
+		if ((flags & SYNC_CLOSE)  && (vp != NULL)) {
+			/*
+			 * This is the shutdown case.  We just need to
+			 * flush and invalidate all the pages associated
+			 * with the inode.  Drop the inode lock since
+			 * we can't hold it across calls to the buffer
+			 * cache.
+			 *
+			 * We don't set the VREMAPPING bit in the vnode
+			 * here, because we don't hold the vnode lock
+			 * exclusively.	 It doesn't really matter, though,
+			 * because we only come here when we're shutting
+			 * down anyway.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+			if (XFS_FORCED_SHUTDOWN(mp)) {
+				if (xflags & XFS_XSYNC_RELOC) {
+					fs_tosspages(XFS_ITOBHV(ip), 0, -1,
+						     FI_REMAPF);
+				}
+				else {
+					VOP_TOSS_PAGES(vp, 0, -1, FI_REMAPF);
+				}
+			} else {
+				if (xflags & XFS_XSYNC_RELOC) {
+					fs_flushinval_pages(XFS_ITOBHV(ip),
+							    0, -1, FI_REMAPF);
+				}
+				else {
+					VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_REMAPF);
+				}
+			}
+
+			xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+		} else if ((flags & SYNC_DELWRI) && (vp != NULL)) {
+			if (VN_DIRTY(vp)) {
+				/* We need to have dropped the lock here,
+				 * so insert a marker if we have not already
+				 * done so.
+				 */
+				if (mount_locked) {
+					IPOINTER_INSERT(ip, mp);
+				}
+
+				/*
+				 * Drop the inode lock since we can't hold it
+				 * across calls to the buffer cache.
+				 */
+				xfs_iunlock(ip, XFS_ILOCK_SHARED);
+				if (do_mmap_flush) {
+					VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1,
+						fflag, FI_NONE, error);
+				} else {
+					filemap_fdatawrite(LINVFS_GET_IP(vp)->i_mapping);
+				}
+				xfs_ilock(ip, XFS_ILOCK_SHARED);
+			}
+
+		}
+
+		if (flags & SYNC_BDFLUSH) {
+			if ((flags & SYNC_ATTR) &&
+			    ((ip->i_update_core) ||
+			     ((ip->i_itemp != NULL) &&
+			      (ip->i_itemp->ili_format.ilf_fields != 0)))) {
+
+				/* Insert marker and drop lock if not already
+				 * done.
+				 */
+				if (mount_locked) {
+					IPOINTER_INSERT(ip, mp);
+				}
+
+				/*
+				 * We don't want the periodic flushing of the
+				 * inodes by vfs_sync() to interfere with
+				 * I/O to the file, especially read I/O
+				 * where it is only the access time stamp
+				 * that is being flushed out.  To prevent
+				 * long periods where we have both inode
+				 * locks held shared here while reading the
+				 * inode's buffer in from disk, we drop the
+				 * inode lock while reading in the inode
+				 * buffer.  We have to release the buffer
+				 * and reacquire the inode lock so that they
+				 * are acquired in the proper order (inode
+				 * locks first).  The buffer will go at the
+				 * end of the lru chain, though, so we can
+				 * expect it to still be there when we go
+				 * for it again in xfs_iflush().
+				 */
+				if ((xfs_ipincount(ip) == 0) &&
+				    xfs_iflock_nowait(ip)) {
+
+					xfs_ifunlock(ip);
+					xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+					error = xfs_itobp(mp, NULL, ip,
+							  &dip, &bp, 0);
+					if (!error) {
+						xfs_buf_relse(bp);
+					} else {
+						/* Bailing out, remove the
+						 * marker and free it.
+						 */
+						XFS_MOUNT_ILOCK(mp);
+
+						IPOINTER_REMOVE(ip, mp);
+
+						XFS_MOUNT_IUNLOCK(mp);
+
+						ASSERT(!(lock_flags &
+							XFS_IOLOCK_SHARED));
+
+						kmem_free(ipointer,
+							sizeof(xfs_iptr_t));
+						return (0);
+					}
+
+					/*
+					 * Since we dropped the inode lock,
+					 * the inode may have been reclaimed.
+					 * Therefore, we reacquire the mount
+					 * lock and check to see if we were the
+					 * inode reclaimed. If this happened
+					 * then the ipointer marker will no
+					 * longer point back at us. In this
+					 * case, move ip along to the inode
+					 * after the marker, remove the marker
+					 * and continue.
+					 */
+					XFS_MOUNT_ILOCK(mp);
+					mount_locked = B_TRUE;
+
+					if (ip != ipointer->ip_mprev) {
+						IPOINTER_REMOVE(ip, mp);
+
+						ASSERT(!vnode_refed);
+						ASSERT(!(lock_flags &
+							XFS_IOLOCK_SHARED));
+						continue;
+					}
+
+					ASSERT(ip->i_mount == mp);
+
+					if (xfs_ilock_nowait(ip,
+						    XFS_ILOCK_SHARED) == 0) {
+						ASSERT(ip->i_mount == mp);
+						/*
+						 * We failed to reacquire
+						 * the inode lock without
+						 * sleeping, so just skip
+						 * the inode for now.  We
+						 * clear the ILOCK bit from
+						 * the lock_flags so that we
+						 * won't try to drop a lock
+						 * we don't hold below.
+						 */
+						lock_flags &= ~XFS_ILOCK_SHARED;
+						IPOINTER_REMOVE(ip_next, mp);
+					} else if ((xfs_ipincount(ip) == 0) &&
+						   xfs_iflock_nowait(ip)) {
+						ASSERT(ip->i_mount == mp);
+						/*
+						 * Since this is vfs_sync()
+						 * calling we only flush the
+						 * inode out if we can lock
+						 * it without sleeping and
+						 * it is not pinned.  Drop
+						 * the mount lock here so
+						 * that we don't hold it for
+						 * too long. We already have
+						 * a marker in the list here.
+						 */
+						XFS_MOUNT_IUNLOCK(mp);
+						mount_locked = B_FALSE;
+						error = xfs_iflush(ip,
+							   XFS_IFLUSH_DELWRI);
+					} else {
+						ASSERT(ip->i_mount == mp);
+						IPOINTER_REMOVE(ip_next, mp);
+					}
+				}
+
+			}
+
+		} else {
+			if ((flags & SYNC_ATTR) &&
+			    ((ip->i_update_core) ||
+			     ((ip->i_itemp != NULL) &&
+			      (ip->i_itemp->ili_format.ilf_fields != 0)))) {
+				if (mount_locked) {
+					IPOINTER_INSERT(ip, mp);
+				}
+
+				if (flags & SYNC_WAIT) {
+					xfs_iflock(ip);
+					error = xfs_iflush(ip,
+							   XFS_IFLUSH_SYNC);
+				} else {
+					/*
+					 * If we can't acquire the flush
+					 * lock, then the inode is already
+					 * being flushed so don't bother
+					 * waiting.  If we can lock it then
+					 * do a delwri flush so we can
+					 * combine multiple inode flushes
+					 * in each disk write.
+					 */
+					if (xfs_iflock_nowait(ip)) {
+						error = xfs_iflush(ip,
+							   XFS_IFLUSH_DELWRI);
+					}
+					else if (bypassed)
+						(*bypassed)++;
+				}
+			}
+		}
+
+		if (lock_flags != 0) {
+			xfs_iunlock(ip, lock_flags);
+		}
+
+		if (vnode_refed) {
+			/*
+			 * If we had to take a reference on the vnode
+			 * above, then wait until after we've unlocked
+			 * the inode to release the reference.	This is
+			 * because we can be already holding the inode
+			 * lock when VN_RELE() calls xfs_inactive().
+			 *
+			 * Make sure to drop the mount lock before calling
+			 * VN_RELE() so that we don't trip over ourselves if
+			 * we have to go for the mount lock again in the
+			 * inactive code.
+			 */
+			if (mount_locked) {
+				IPOINTER_INSERT(ip, mp);
+			}
+
+			VN_RELE(vp);
+
+			vnode_refed = B_FALSE;
+		}
+
+		if (error) {
+			last_error = error;
+		}
+
+		/*
+		 * bail out if the filesystem is corrupted.
+		 */
+		if (error == EFSCORRUPTED)  {
+			if (!mount_locked) {
+				XFS_MOUNT_ILOCK(mp);
+				IPOINTER_REMOVE(ip, mp);
+			}
+			XFS_MOUNT_IUNLOCK(mp);
+			ASSERT(ipointer_in == B_FALSE);
+			kmem_free(ipointer, sizeof(xfs_iptr_t));
+			return XFS_ERROR(error);
+		}
+
+		/* Let other threads have a chance at the mount lock
+		 * if we have looped many times without dropping the
+		 * lock.
+		 */
+		if ((++preempt & PREEMPT_MASK) == 0) {
+			if (mount_locked) {
+				IPOINTER_INSERT(ip, mp);
+			}
+		}
+
+		if (mount_locked == B_FALSE) {
+			XFS_MOUNT_ILOCK(mp);
+			mount_locked = B_TRUE;
+			IPOINTER_REMOVE(ip, mp);
+			continue;
+		}
+
+		ASSERT(ipointer_in == B_FALSE);
+		ip = ip->i_mnext;
+
+	} while (ip->i_mnext != mp->m_inodes);
+
+	XFS_MOUNT_IUNLOCK(mp);
+
+	ASSERT(ipointer_in == B_FALSE);
+
+	/*
+	 * Get the Quota Manager to flush the dquots in a similar manner.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if ((error = xfs_qm_sync(mp, flags))) {
+			/*
+			 * If we got an IO error, we will be shutting down.
+			 * So, there's nothing more for us to do here.
+			 */
+			ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
+			if (XFS_FORCED_SHUTDOWN(mp)) {
+				kmem_free(ipointer, sizeof(xfs_iptr_t));
+				return XFS_ERROR(error);
+			}
+		}
+	}
+
+	/*
+	 * Flushing out dirty data above probably generated more
+	 * log activity, so if this isn't vfs_sync() then flush
+	 * the log again.  If SYNC_WAIT is set then do it synchronously.
+	 */
+	if (!(flags & SYNC_BDFLUSH)) {
+		log_flags = XFS_LOG_FORCE;
+		if (flags & SYNC_WAIT) {
+			log_flags |= XFS_LOG_SYNC;
+		}
+		xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
+	}
+
+	if (flags & SYNC_FSDATA) {
+		/*
+		 * If this is vfs_sync() then only sync the superblock
+		 * if we can lock it without sleeping and it is not pinned.
+		 */
+		if (flags & SYNC_BDFLUSH) {
+			bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+			if (bp != NULL) {
+				bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
+				if ((bip != NULL) &&
+				    xfs_buf_item_dirty(bip)) {
+					if (!(XFS_BUF_ISPINNED(bp))) {
+						XFS_BUF_ASYNC(bp);
+						error = xfs_bwrite(mp, bp);
+					} else {
+						xfs_buf_relse(bp);
+					}
+				} else {
+					xfs_buf_relse(bp);
+				}
+			}
+		} else {
+			bp = xfs_getsb(mp, 0);
+			/*
+			 * If the buffer is pinned then push on the log so
+			 * we won't get stuck waiting in the write for
+			 * someone, maybe ourselves, to flush the log.
+			 * Even though we just pushed the log above, we
+			 * did not have the superblock buffer locked at
+			 * that point so it can become pinned in between
+			 * there and here.
+			 */
+			if (XFS_BUF_ISPINNED(bp)) {
+				xfs_log_force(mp, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE);
+			}
+			XFS_BUF_BFLAGS(bp) |= fflag;
+			error = xfs_bwrite(mp, bp);
+		}
+		if (error) {
+			last_error = error;
+		}
+	}
+
+	/*
+	 * If this is the 30 second sync, then kick some entries out of
+	 * the reference cache.	 This ensures that idle entries are
+	 * eventually kicked out of the cache.
+	 */
+	if (flags & SYNC_BDFLUSH) {
+		xfs_refcache_purge_some(mp);
+	}
+
+	/*
+	 * Now check to see if the log needs a "dummy" transaction.
+	 */
+
+	if (xfs_log_need_covered(mp)) {
+		xfs_trans_t *tp;
+
+		/*
+		 * Put a dummy transaction in the log to tell
+		 * recovery that all others are OK.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+		if ((error = xfs_trans_reserve(tp, 0,
+				XFS_ICHANGE_LOG_RES(mp),
+				0, 0, 0)))  {
+			xfs_trans_cancel(tp, 0);
+			kmem_free(ipointer, sizeof(xfs_iptr_t));
+			return error;
+		}
+
+		ip = mp->m_rootip;
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		error = xfs_trans_commit(tp, 0, NULL);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	/*
+	 * When shutting down, we need to insure that the AIL is pushed
+	 * to disk or the filesystem can appear corrupt from the PROM.
+	 */
+	if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
+		XFS_bflush(mp->m_ddev_targp);
+		if (mp->m_rtdev_targp) {
+			XFS_bflush(mp->m_rtdev_targp);
+		}
+	}
+
+	kmem_free(ipointer, sizeof(xfs_iptr_t));
+	return XFS_ERROR(last_error);
+}
+
+
+/*
+ * xfs_vget - called by DMAPI to get vnode from file handle
+ */
+STATIC int
+xfs_vget(
+	bhv_desc_t	*bdp,
+	vnode_t		**vpp,
+	fid_t		*fidp)
+{
+	xfs_fid_t	*xfid;
+	xfs_inode_t	*ip;
+	int		error;
+	xfs_ino_t	ino;
+	unsigned int	igen;
+	xfs_mount_t	*mp;
+	struct inode	*inode = NULL;
+
+	xfid  = (struct xfs_fid *)fidp;
+
+	if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) {
+		ino  = xfid->xfs_fid_ino;
+		igen = xfid->xfs_fid_gen;
+	} else {
+		/*
+		 * Invalid.  Since handles can be created in user space
+		 * and passed in via gethandle(), this is not cause for
+		 * a panic.
+		 */
+		return XFS_ERROR(EINVAL);
+	}
+	mp = XFS_BHVTOM(bdp);
+	error = xfs_iget(mp, NULL, ino, XFS_ILOCK_SHARED, &ip, 0);
+	if (error) {
+		*vpp = NULL;
+		return error;
+	}
+	if (ip == NULL) {
+		*vpp = NULL;
+		return XFS_ERROR(EIO);
+	}
+
+	if (ip->i_d.di_mode == 0 || (igen && (ip->i_d.di_gen != igen))) {
+		xfs_iput_new(ip, XFS_ILOCK_SHARED);
+		*vpp = NULL;
+		return XFS_ERROR(ENOENT);
+	}
+
+	*vpp = XFS_ITOV(ip);
+	inode = LINVFS_GET_IP((*vpp));
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	error = linvfs_revalidate_core(inode, ATTR_COMM);
+	if (error) {
+		iput(inode);
+		return XFS_ERROR(error);
+	}
+	return 0;
+}
+
+
+vfsops_t xfs_vfsops = {
+	.vfs_mount		= xfs_mount,
+	.vfs_dounmount		= fs_dounmount,
+	.vfs_unmount		= xfs_unmount,
+	.vfs_root		= xfs_root,
+	.vfs_statvfs		= xfs_statvfs,
+	.vfs_sync		= xfs_sync,
+	.vfs_vget		= xfs_vget,
+	.vfs_force_shutdown	= xfs_do_force_shutdown,
+#ifdef CONFIG_XFS_DMAPI
+	.vfs_dmapi_mount	= xfs_dm_mount,
+	.vfs_dmapi_fsys_vector	= xfs_dm_get_fsys_vector,
+#endif
+};
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
new file mode 100644
index 000000000000..d6cdbef91298
--- /dev/null
+++ b/fs/xfs/xfs_vnodeops.c
@@ -0,0 +1,5265 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.	 Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include <xfs.h>
+#include <asm/fcntl.h>
+
+
+/*
+ * The maximum pathlen is 1024 bytes. Since the minimum file system
+ * blocksize is 512 bytes, we can get a max of 2 extents back from
+ * bmapi.
+ */
+#define SYMLINK_MAPS 2
+
+extern int xfs_ioctl(bhv_desc_t *, struct inode *, struct file *,
+			unsigned int, unsigned long);
+
+
+#ifdef XFS_RW_TRACE
+STATIC void
+xfs_ctrunc_trace(
+	int		tag,
+	xfs_inode_t	*ip);
+#else
+#define xfs_ctrunc_trace(tag, ip)
+#endif /* DEBUG */
+
+/*
+ * For xfs, we check that the file isn't too big to be opened by this kernel.
+ * No other open action is required for regular files.	Devices are handled
+ * through the specfs file system, pipes through fifofs.  Device and
+ * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
+ * when a new vnode is first looked up or created.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_open(
+	bhv_desc_t	*bdp,
+	cred_t		*credp)
+{
+	int		mode;
+	vnode_t		*vp;
+	xfs_inode_t	*ip;
+
+	vp = BHV_TO_VNODE(bdp);
+	ip = XFS_BHVTOI(bdp);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * If it's a directory with any blocks, read-ahead block 0
+	 * as we're almost certain to have the next operation be a read there.
+	 */
+	if (vp->v_type == VDIR && ip->i_d.di_nextents > 0) {
+		mode = xfs_ilock_map_shared(ip);
+		(void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+		xfs_iunlock(ip, mode);
+	}
+	return 0;
+}
+
+
+/*
+ * xfs_getattr
+ */
+/*ARGSUSED*/
+int
+xfs_getattr(
+	bhv_desc_t	*bdp,
+	vattr_t		*vap,
+	int		flags,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+	vnode_t		*vp;
+
+	vp  = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_getattr", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	if (!(flags & ATTR_LAZY))
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	vap->va_size = ip->i_d.di_size;
+	if (vap->va_mask == AT_SIZE) {
+		if (!(flags & ATTR_LAZY))
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return 0;
+	}
+	vap->va_nblocks =
+		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
+	vap->va_fsid = mp->m_dev;
+#if XFS_BIG_FILESYSTEMS
+	vap->va_nodeid = ip->i_ino + mp->m_inoadd;
+#else
+	vap->va_nodeid = ip->i_ino;
+#endif
+	vap->va_nlink = ip->i_d.di_nlink;
+
+	/*
+	 * Quick exit for non-stat callers
+	 */
+	if ((vap->va_mask & ~(AT_SIZE|AT_FSID|AT_NODEID|AT_NLINK)) == 0) {
+		if (!(flags & ATTR_LAZY))
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return 0;
+	}
+
+	/*
+	 * Copy from in-core inode.
+	 */
+	vap->va_type = vp->v_type;
+	vap->va_mode = ip->i_d.di_mode & MODEMASK;
+	vap->va_uid = ip->i_d.di_uid;
+	vap->va_gid = ip->i_d.di_gid;
+	vap->va_projid = ip->i_d.di_projid;
+
+	/*
+	 * Check vnode type block/char vs. everything else.
+	 * Do it with bitmask because that's faster than looking
+	 * for multiple values individually.
+	 */
+	if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
+		vap->va_rdev = 0;
+
+		if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
+
+#if 0
+			/* Large block sizes confuse various
+			 * user space programs, so letting the
+			 * stripe size through is not a good
+			 * idea for now.
+			 */
+			vap->va_blksize = mp->m_swidth ?
+				/*
+				 * If the underlying volume is a stripe, then
+				 * return the stripe width in bytes as the
+				 * recommended I/O size.
+				 */
+				(mp->m_swidth << mp->m_sb.sb_blocklog) :
+				/*
+				 * Return the largest of the preferred buffer
+				 * sizes since doing small I/Os into larger
+				 * buffers causes buffers to be decommissioned.
+				 * The value returned is in bytes.
+				 */
+				(1 << (int)MAX(mp->m_readio_log,
+					       mp->m_writeio_log));
+
+#else
+			vap->va_blksize =
+				/*
+				 * Return the largest of the preferred buffer
+				 * sizes since doing small I/Os into larger
+				 * buffers causes buffers to be decommissioned.
+				 * The value returned is in bytes.
+				 */
+				1 << (int)MAX(mp->m_readio_log,
+					       mp->m_writeio_log);
+#endif
+		} else {
+
+			/*
+			 * If the file blocks are being allocated from a
+			 * realtime partition, then return the inode's
+			 * realtime extent size or the realtime volume's
+			 * extent size.
+			 */
+			vap->va_blksize = ip->i_d.di_extsize ?
+				(ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
+				(mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
+		}
+	} else {
+		vap->va_rdev = IRIX_DEV_TO_KDEVT(ip->i_df.if_u2.if_rdev);
+		vap->va_blksize = BLKDEV_IOSIZE;
+	}
+
+	vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec;
+	vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
+	vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
+	vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
+	vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
+	vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
+
+	/*
+	 * Exit for stat callers.  See if any of the rest of the fields
+	 * to be filled in are needed.
+	 */
+	if ((vap->va_mask &
+	     (AT_XFLAGS|AT_EXTSIZE|AT_NEXTENTS|AT_ANEXTENTS|
+	      AT_GENCOUNT|AT_VCODE)) == 0) {
+		if (!(flags & ATTR_LAZY))
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return 0;
+	}
+	/*
+	 * convert di_flags to xflags
+	 */
+	vap->va_xflags =
+		((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
+			XFS_XFLAG_REALTIME : 0) |
+		((ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) ?
+			XFS_XFLAG_PREALLOC : 0);
+	vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
+	vap->va_nextents =
+		(ip->i_df.if_flags & XFS_IFEXTENTS) ?
+			ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
+			ip->i_d.di_nextents;
+	if (ip->i_afp != NULL)
+		vap->va_anextents =
+			(ip->i_afp->if_flags & XFS_IFEXTENTS) ?
+				ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
+				 ip->i_d.di_anextents;
+	else
+		vap->va_anextents = 0;
+	vap->va_gencount = ip->i_d.di_gen;
+	vap->va_vcode = 0L;
+
+	if (!(flags & ATTR_LAZY))
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return 0;
+}
+
+
+/*
+ * xfs_setattr
+ */
+STATIC int
+xfs_setattr(
+	bhv_desc_t	*bdp,
+	vattr_t		*vap,
+	int		flags,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	int		mask;
+	int		code;
+	uint		lock_flags;
+	uint		commit_flags=0;
+	uid_t		uid=0, iuid=0;
+	gid_t		gid=0, igid=0;
+	int		timeflags = 0;
+	vnode_t		*vp;
+	xfs_prid_t	projid=0, iprojid=0;
+	int		privileged;
+	int		mandlock_before, mandlock_after;
+	uint		qflags;
+	struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
+	int		file_owner;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_setattr", (inst_t *)__return_address);
+	/*
+	 * Cannot set certain attributes.
+	 */
+	mask = vap->va_mask;
+	if (mask & AT_NOSET) {
+		return XFS_ERROR(EINVAL);
+	}
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * Timestamps do not need to be logged and hence do not
+	 * need to be done within a transaction.
+	 */
+	if (mask & AT_UPDTIMES) {
+		ASSERT((mask & ~AT_UPDTIMES) == 0);
+		timeflags = ((mask & AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
+			    ((mask & AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
+			    ((mask & AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
+		xfs_ichgtime(ip, timeflags);
+		return 0;
+	}
+
+	olddquot1 = olddquot2 = NULL;
+	udqp = gdqp = NULL;
+
+	/*
+	 * If disk quotas is on, we make sure that the dquots do exist on disk,
+	 * before we start any other transactions. Trying to do this later
+	 * is messy. We don't care to take a readlock to look at the ids
+	 * in inode here, because we can't hold it across the trans_reserve.
+	 * If the IDs do change before we take the ilock, we're covered
+	 * because the i_*dquot fields will get updated anyway.
+	 */
+	if (XFS_IS_QUOTA_ON(mp) && (mask & (AT_UID|AT_GID))) {
+		qflags = 0;
+		if (mask & AT_UID) {
+			uid = vap->va_uid;
+			qflags |= XFS_QMOPT_UQUOTA;
+		} else {
+			uid = ip->i_d.di_uid;
+		}
+		if (mask & AT_GID) {
+			gid = vap->va_gid;
+			qflags |= XFS_QMOPT_GQUOTA;
+		}  else {
+			gid = ip->i_d.di_gid;
+		}
+		/*
+		 * We take a reference when we initialize udqp and gdqp,
+		 * so it is important that we never blindly double trip on
+		 * the same variable. See xfs_create() for an example.
+		 */
+		ASSERT(udqp == NULL);
+		ASSERT(gdqp == NULL);
+		if ((code = xfs_qm_vop_dqalloc(mp, ip, uid, gid, qflags,
+						    &udqp, &gdqp)))
+			return (code);
+	}
+
+	/*
+	 * For the other attributes, we acquire the inode lock and
+	 * first do an error checking pass.
+	 */
+	tp = NULL;
+	lock_flags = XFS_ILOCK_EXCL;
+	if (!(mask & AT_SIZE)) {
+		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+		commit_flags = 0;
+		if ((code = xfs_trans_reserve(tp, 0,
+					     XFS_ICHANGE_LOG_RES(mp), 0,
+					     0, 0))) {
+			lock_flags = 0;
+			goto error_return;
+		}
+	} else {
+		if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
+		    !(flags & ATTR_DMI)) {
+			code = xfs_dm_send_data_event (DM_EVENT_TRUNCATE, bdp,
+				vap->va_size, 0, AT_DELAY_FLAG(flags), NULL);
+			if (code) {
+				lock_flags = 0;
+				goto error_return;
+			}
+		}
+		lock_flags |= XFS_IOLOCK_EXCL;
+	}
+
+	xfs_ilock(ip, lock_flags);
+
+	if (_MAC_XFS_IACCESS(ip, MACWRITE, credp)) {
+		code = XFS_ERROR(EACCES);
+		goto error_return;
+	}
+
+	/* boolean: are we the file owner? */
+	file_owner = (current->fsuid == ip->i_d.di_uid);
+
+	/*
+	 * Change various properties of a file.
+	 * Only the owner or users with CAP_FOWNER
+	 * capability may do these things.
+	 */
+	if (mask & (AT_MODE|AT_XFLAGS|AT_EXTSIZE|AT_UID|AT_GID|AT_PROJID)) {
+		/*
+		 * CAP_FOWNER overrides the following restrictions:
+		 *
+		 * The user ID of the calling process must be equal
+		 * to the file owner ID, except in cases where the
+		 * CAP_FSETID capability is applicable.
+		 */
+		if (!file_owner && !capable(CAP_FOWNER)) {
+			code = XFS_ERROR(EPERM);
+			goto error_return;
+		}
+
+		/*
+		 * CAP_FSETID overrides the following restrictions:
+		 *
+		 * The effective user ID of the calling process shall match
+		 * the file owner when setting the set-user-ID and
+		 * set-group-ID bits on that file.
+		 *
+		 * The effective group ID or one of the supplementary group
+		 * IDs of the calling process shall match the group owner of
+		 * the file when setting the set-group-ID bit on that file
+		 */
+		if (mask & AT_MODE) {
+			mode_t m = 0;
+
+			if ((vap->va_mode & ISUID) && !file_owner)
+				m |= ISUID;
+			if ((vap->va_mode & ISGID) &&
+			    !in_group_p((gid_t)ip->i_d.di_gid))
+				m |= ISGID;
+#if 0
+			/* Linux allows this, Irix doesn't. */
+			if ((vap->va_mode & ISVTX) && vp->v_type != VDIR)
+				m |= ISVTX;
+#endif
+			if (m && !capable(CAP_FSETID))
+				vap->va_mode &= ~m;
+		}
+	}
+
+	/*
+	 * Change file ownership.  Must be the owner or privileged.
+	 * If the system was configured with the "restricted_chown"
+	 * option, the owner is not permitted to give away the file,
+	 * and can change the group id only to a group of which he
+	 * or she is a member.
+	 */
+	if (mask & (AT_UID|AT_GID|AT_PROJID)) {
+		/*
+		 * These IDs could have changed since we last looked at them.
+		 * But, we're assured that if the ownership did change
+		 * while we didn't have the inode locked, inode's dquot(s)
+		 * would have changed also.
+		 */
+		iuid = ip->i_d.di_uid;
+		iprojid = ip->i_d.di_projid;
+		igid = ip->i_d.di_gid;
+		gid = (mask & AT_GID) ? vap->va_gid : igid;
+		uid = (mask & AT_UID) ? vap->va_uid : iuid;
+		projid = (mask & AT_PROJID) ? (xfs_prid_t)vap->va_projid :
+			 iprojid;
+
+		/*
+		 * CAP_CHOWN overrides the following restrictions:
+		 *
+		 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
+		 * shall override the restriction that a process cannot
+		 * change the user ID of a file it owns and the restriction
+		 * that the group ID supplied to the chown() function
+		 * shall be equal to either the group ID or one of the
+		 * supplementary group IDs of the calling process.
+		 *
+		 * XXX: How does restricted_chown affect projid?
+		 */
+		if (restricted_chown &&
+		    (iuid != uid || (igid != gid &&
+				     !in_group_p((gid_t)gid))) &&
+		    !capable(CAP_CHOWN)) {
+			code = XFS_ERROR(EPERM);
+			goto error_return;
+		}
+		/*
+		 * Do a quota reservation only if uid or gid is actually
+		 * going to change.
+		 */
+		if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
+		    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
+			ASSERT(tp);
+			/*
+			 * XXX:casey - This may result in unnecessary auditing.
+			 */
+			privileged = capable(CAP_FOWNER);
+			if ((code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
+							  privileged ?
+							  XFS_QMOPT_FORCE_RES :
+							  0)))
+				/* out of quota */
+				goto error_return;
+		}
+	}
+
+	/*
+	 * Truncate file.  Must have write permission and not be a directory.
+	 */
+	if (mask & AT_SIZE) {
+		/* Short circuit the truncate case for zero length files */
+		if ((vap->va_size == 0) &&
+		   (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			lock_flags &= ~XFS_ILOCK_EXCL;
+			if (mask & AT_CTIME)
+				xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+			code = 0;
+			goto error_return;
+		}
+
+		if (vp->v_type == VDIR) {
+			code = XFS_ERROR(EISDIR);
+			goto error_return;
+		} else if (vp->v_type != VREG) {
+			code = XFS_ERROR(EINVAL);
+			goto error_return;
+		}
+		/*
+		 * Make sure that the dquots are attached to the inode.
+		 */
+		if (XFS_IS_QUOTA_ON(mp) && XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((code = xfs_qm_dqattach(ip, XFS_QMOPT_ILOCKED)))
+				goto error_return;
+		}
+	}
+
+	/*
+	 * Change file access or modified times.
+	 */
+	if (mask & (AT_ATIME|AT_MTIME)) {
+		if (!file_owner) {
+			if ((flags & ATTR_UTIME) &&
+			    !capable(CAP_FOWNER)) {
+				code = XFS_ERROR(EPERM);
+				goto error_return;
+			}
+		}
+	}
+
+	/*
+	 * Change extent size or realtime flag.
+	 */
+	if (mask & (AT_EXTSIZE|AT_XFLAGS)) {
+		/*
+		 * Can't change extent size if any extents are allocated.
+		 */
+		if (ip->i_d.di_nextents && (mask & AT_EXTSIZE) &&
+		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
+		     vap->va_extsize) ) {
+			code = XFS_ERROR(EINVAL);	/* EFBIG? */
+			goto error_return;
+		}
+
+		/*
+		 * Can't set extent size unless the file is marked, or
+		 * about to be marked as a realtime file.
+		 *
+		 * This check will be removed when fixed size extents
+		 * with buffered data writes is implemented.
+		 *
+		 */
+		if ((mask & AT_EXTSIZE)				&&
+		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
+		     vap->va_extsize) &&
+		    (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
+		       ((mask & AT_XFLAGS) &&
+			(vap->va_xflags & XFS_XFLAG_REALTIME))))) {
+			code = XFS_ERROR(EINVAL);
+			goto error_return;
+		}
+
+		/*
+		 * Can't change realtime flag if any extents are allocated.
+		 */
+		if (ip->i_d.di_nextents && (mask & AT_XFLAGS) &&
+		    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
+		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
+			code = XFS_ERROR(EINVAL);	/* EFBIG? */
+			goto error_return;
+		}
+		/*
+		 * Extent size must be a multiple of the appropriate block
+		 * size, if set at all.
+		 */
+		if ((mask & AT_EXTSIZE) && vap->va_extsize != 0) {
+			xfs_extlen_t	size;
+
+			if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
+			    ((mask & AT_XFLAGS) &&
+			    (vap->va_xflags & XFS_XFLAG_REALTIME))) {
+				size = mp->m_sb.sb_rextsize <<
+				       mp->m_sb.sb_blocklog;
+			} else {
+				size = mp->m_sb.sb_blocksize;
+			}
+			if (vap->va_extsize % size) {
+				code = XFS_ERROR(EINVAL);
+				goto error_return;
+			}
+		}
+		/*
+		 * If realtime flag is set then must have realtime data.
+		 */
+		if ((mask & AT_XFLAGS) &&
+		    (vap->va_xflags & XFS_XFLAG_REALTIME)) {
+			if ((mp->m_sb.sb_rblocks == 0) ||
+			    (mp->m_sb.sb_rextsize == 0) ||
+			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
+				code = XFS_ERROR(EINVAL);
+				goto error_return;
+			}
+		}
+	}
+
+	/*
+	 * Now we can make the changes.	 Before we join the inode
+	 * to the transaction, if AT_SIZE is set then take care of
+	 * the part of the truncation that must be done without the
+	 * inode lock.	This needs to be done before joining the inode
+	 * to the transaction, because the inode cannot be unlocked
+	 * once it is a part of the transaction.
+	 */
+	if (mask & AT_SIZE) {
+		if (vap->va_size > ip->i_d.di_size) {
+			code = xfs_igrow_start(ip, vap->va_size, credp);
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		} else if (vap->va_size < ip->i_d.di_size) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
+					    (xfs_fsize_t)vap->va_size);
+			code = 0;
+		} else {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			code = 0;
+		}
+		if (code) {
+			ASSERT(tp == NULL);
+			lock_flags &= ~XFS_ILOCK_EXCL;
+			ASSERT(lock_flags == XFS_IOLOCK_EXCL);
+			goto error_return;
+		}
+		tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
+		if ((code = xfs_trans_reserve(tp, 0,
+					     XFS_ITRUNCATE_LOG_RES(mp), 0,
+					     XFS_TRANS_PERM_LOG_RES,
+					     XFS_ITRUNCATE_LOG_COUNT))) {
+			xfs_trans_cancel(tp, 0);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return code;
+		}
+		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+	}
+
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ihold(tp, ip);
+
+	/* determine whether mandatory locking mode changes */
+	mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
+
+	/*
+	 * Truncate file.  Must have write permission and not be a directory.
+	 */
+	if (mask & AT_SIZE) {
+		if (vap->va_size > ip->i_d.di_size) {
+			xfs_igrow_finish(tp, ip, vap->va_size,
+			    !(flags & ATTR_DMI));
+		} else if ((vap->va_size < ip->i_d.di_size) ||
+			   ((vap->va_size == 0) && ip->i_d.di_nextents)) {
+			/*
+			 * signal a sync transaction unless
+			 * we're truncating an already unlinked
+			 * file on a wsync filesystem
+			 */
+			code = xfs_itruncate_finish(&tp, ip,
+					    (xfs_fsize_t)vap->va_size,
+					    XFS_DATA_FORK,
+					    ((ip->i_d.di_nlink != 0 ||
+					      !(mp->m_flags & XFS_MOUNT_WSYNC))
+					     ? 1 : 0));
+			if (code) {
+				goto abort_return;
+			}
+		}
+		/*
+		 * Have to do this even if the file's size doesn't change.
+		 */
+		timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+	}
+
+	/*
+	 * Change file access modes.
+	 */
+	if (mask & AT_MODE) {
+		ip->i_d.di_mode &= IFMT;
+		ip->i_d.di_mode |= vap->va_mode & ~IFMT;
+
+		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
+		timeflags |= XFS_ICHGTIME_CHG;
+	}
+
+	/*
+	 * Change file ownership.  Must be the owner or privileged.
+	 * If the system was configured with the "restricted_chown"
+	 * option, the owner is not permitted to give away the file,
+	 * and can change the group id only to a group of which he
+	 * or she is a member.
+	 */
+	if (mask & (AT_UID|AT_GID|AT_PROJID)) {
+		/*
+		 * CAP_FSETID overrides the following restrictions:
+		 *
+		 * The set-user-ID and set-group-ID bits of a file will be
+		 * cleared upon successful return from chown()
+		 */
+		if ((ip->i_d.di_mode & (ISUID|ISGID)) &&
+		    !capable(CAP_FSETID)) {
+			ip->i_d.di_mode &= ~(ISUID|ISGID);
+		}
+
+		/*
+		 * Change the ownerships and register quota modifications
+		 * in the transaction.
+		 */
+		if (iuid != uid) {
+			if (XFS_IS_UQUOTA_ON(mp)) {
+				ASSERT(mask & AT_UID);
+				ASSERT(udqp);
+				ASSERT(xfs_qm_dqid(udqp) == (xfs_dqid_t)uid);
+				olddquot1 = xfs_qm_vop_chown(tp, ip,
+							     &ip->i_udquot,
+							     udqp);
+				/*
+				 * We'll dqrele olddquot at the end.
+				 */
+			}
+			ip->i_d.di_uid = uid;
+		}
+		if (igid != gid) {
+			if (XFS_IS_GQUOTA_ON(mp)) {
+				ASSERT(mask & AT_GID);
+				ASSERT(gdqp);
+				ASSERT(xfs_qm_dqid(gdqp) == gid);
+				olddquot2 = xfs_qm_vop_chown(tp, ip,
+							     &ip->i_gdquot,
+							     gdqp);
+			}
+			ip->i_d.di_gid = gid;
+		}
+		if (iprojid != projid) {
+			ip->i_d.di_projid = projid;
+			/*
+			 * We may have to rev the inode as well as
+			 * the superblock version number since projids didn't
+			 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
+			 */
+			if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
+				xfs_bump_ino_vers2(tp, ip);
+		}
+
+		xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
+		timeflags |= XFS_ICHGTIME_CHG;
+	}
+
+
+	/*
+	 * Change file access or modified times.
+	 */
+	if (mask & (AT_ATIME|AT_MTIME)) {
+		if (mask & AT_ATIME) {
+			ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
+			ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
+			ip->i_update_core = 1;
+			timeflags &= ~XFS_ICHGTIME_ACC;
+		}
+		if (mask & AT_MTIME) {
+			ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
+			ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
+			timeflags &= ~XFS_ICHGTIME_MOD;
+			timeflags |= XFS_ICHGTIME_CHG;
+		}
+	}
+
+	/*
+	 * Change XFS-added attributes.
+	 */
+	if (mask & (AT_EXTSIZE|AT_XFLAGS)) {
+		if (mask & AT_EXTSIZE) {
+			/*
+			 * Converting bytes to fs blocks.
+			 */
+			ip->i_d.di_extsize = vap->va_extsize >>
+				mp->m_sb.sb_blocklog;
+		}
+		if (mask & AT_XFLAGS) {
+			ip->i_d.di_flags = 0;
+			if (vap->va_xflags & XFS_XFLAG_REALTIME) {
+				ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
+				/* This is replicated in the io core for
+				 * CXFS use
+				 */
+				ip->i_iocore.io_flags |= XFS_IOCORE_RT;
+			}
+			/* can't set PREALLOC this way, just ignore it */
+		}
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		timeflags |= XFS_ICHGTIME_CHG;
+	}
+
+	/*
+	 * Change file inode change time only if AT_CTIME set
+	 * AND we have been called by a DMI function.
+	 */
+
+	if ( (flags & ATTR_DMI) && (mask & AT_CTIME) ) {
+		ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
+		ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
+		ip->i_update_core = 1;
+		timeflags &= ~XFS_ICHGTIME_CHG;
+	}
+
+	/*
+	 * Send out timestamp changes that need to be set to the
+	 * current time.  Not done when called by a DMI function.
+	 */
+	if (timeflags && !(flags & ATTR_DMI))
+		xfs_ichgtime(ip, timeflags);
+
+	XFS_STATS_INC(xfsstats.xs_ig_attrchg);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * transaction goes to disk before returning to the user.
+	 * This is slightly sub-optimal in that truncates require
+	 * two sync transactions instead of one for wsync filesytems.
+	 * One for the truncate and one for the timestamps since we
+	 * don't want to change the timestamps unless we're sure the
+	 * truncate worked.  Truncates are less than 1% of the laddis
+	 * mix so this probably isn't worth the trouble to optimize.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	code = xfs_trans_commit(tp, commit_flags, NULL);
+
+	/*
+	 * If the (regular) file's mandatory locking mode changed, then
+	 * notify the vnode.  We do this under the inode lock to prevent
+	 * racing calls to vop_vnode_change.
+	 */
+	mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
+	if (mandlock_before != mandlock_after) {
+		VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
+				 mandlock_after);
+	}
+
+	xfs_iunlock(ip, lock_flags);
+
+	/*
+	 * release any dquot(s) inode had kept before chown
+	 */
+	if (olddquot1)
+		xfs_qm_dqrele(olddquot1);
+	if (olddquot2)
+		xfs_qm_dqrele(olddquot2);
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	if (code) {
+		return code;
+	}
+
+	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
+	    !(flags & ATTR_DMI)) {
+		(void) dm_send_namesp_event (DM_EVENT_ATTRIBUTE, bdp, DM_RIGHT_NULL,
+				NULL, DM_RIGHT_NULL, NULL, NULL,
+			0, 0, AT_DELAY_FLAG(flags));
+	}
+	return 0;
+
+ abort_return:
+	commit_flags |= XFS_TRANS_ABORT;
+	/* FALLTHROUGH */
+ error_return:
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+	if (tp) {
+		xfs_trans_cancel(tp, commit_flags);
+	}
+	if (lock_flags != 0) {
+		xfs_iunlock(ip, lock_flags);
+	}
+	return code;
+} /* xfs_setattr */
+
+
+/*
+ * xfs_access
+ * Null conversion from vnode mode bits to inode mode bits, as in efs.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_access(
+	bhv_desc_t	*bdp,
+	int		mode,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	int		error;
+
+	vn_trace_entry(BHV_TO_VNODE(bdp), "xfs_access",
+					       (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	error = xfs_iaccess(ip, mode, credp);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
+}
+
+
+/*
+ * xfs_readlink
+ *
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_readlink(
+	bhv_desc_t	*bdp,
+	uio_t		*uiop,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	int		count;
+	xfs_off_t	offset;
+	int		pathlen;
+	vnode_t		*vp;
+	int		error = 0;
+	xfs_mount_t	*mp;
+	int		nmaps;
+	xfs_bmbt_irec_t mval[SYMLINK_MAPS];
+	xfs_daddr_t	d;
+	int		byte_cnt;
+	int		n;
+	xfs_buf_t	*bp;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_readlink", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	ASSERT((ip->i_d.di_mode & IFMT) == IFLNK);
+
+	offset = uiop->uio_offset;
+	count = uiop->uio_resid;
+
+	if (offset < 0) {
+		error = XFS_ERROR(EINVAL);
+		goto error_return;
+	}
+	if (count <= 0) {
+		error = 0;
+		goto error_return;
+	}
+
+	if (!(uiop->uio_fmode & FINVIS)) {
+		xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
+	}
+
+	/*
+	 * See if the symlink is stored inline.
+	 */
+	pathlen = (int)ip->i_d.di_size;
+
+	if (ip->i_df.if_flags & XFS_IFINLINE) {
+		error = uiomove(ip->i_df.if_u1.if_data, pathlen, UIO_READ, uiop);
+	}
+	else {
+		/*
+		 * Symlink not inline.	Call bmap to get it in.
+		 */
+		nmaps = SYMLINK_MAPS;
+
+		error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
+				  0, NULL, 0, mval, &nmaps, NULL);
+
+		if (error) {
+			goto error_return;
+		}
+
+		for (n = 0; n < nmaps; n++) {
+			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+			bp = xfs_buf_read(mp->m_ddev_targp, d,
+				      BTOBB(byte_cnt), 0);
+			error = XFS_BUF_GETERROR(bp);
+			if (error) {
+				xfs_ioerror_alert("xfs_readlink",
+					  ip->i_mount, bp, XFS_BUF_ADDR(bp));
+				xfs_buf_relse(bp);
+				goto error_return;
+			}
+			if (pathlen < byte_cnt)
+				byte_cnt = pathlen;
+			pathlen -= byte_cnt;
+
+			error = uiomove(XFS_BUF_PTR(bp), byte_cnt,
+					 UIO_READ, uiop);
+			xfs_buf_relse (bp);
+		}
+
+	}
+
+
+error_return:
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	return error;
+}
+
+/*
+ * xfs_fsync
+ *
+ * This is called to sync the inode and its data out to disk.
+ * We need to hold the I/O lock while flushing the data, and
+ * the inode lock while flushing the inode.  The inode lock CANNOT
+ * be held while flushing the data, so acquire after we're done
+ * with that.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_fsync(
+	bhv_desc_t	*bdp,
+	int		flag,
+	cred_t		*credp,
+	xfs_off_t	start,
+	xfs_off_t	stop)
+{
+	xfs_inode_t	*ip;
+	int		error;
+	/* REFERENCED */
+	int		error2;
+					/* REFERENCED */
+	int		syncall;
+	vnode_t		*vp;
+	xfs_trans_t	*tp;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_fsync", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+
+	ASSERT(start >= 0 && stop >= -1);
+
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return XFS_ERROR(EIO);
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	syncall = error = error2 = 0;
+
+	if (stop == -1)	 {
+		ASSERT(start >= 0);
+		if (start == 0)
+			syncall = 1;
+		stop = xfs_file_last_byte(ip);
+	}
+
+	/*
+	 * If we're invalidating, always flush since we want to
+	 * tear things down.  Otherwise, don't flush anything if
+	 * we're not dirty.
+	 */
+	if (flag & FSYNC_INVAL) {
+		if (ip->i_df.if_flags & XFS_IFEXTENTS &&
+		    ip->i_df.if_bytes > 0) {
+			VOP_FLUSHINVAL_PAGES(vp, start, -1, FI_REMAPF_LOCKED);
+		}
+		ASSERT(syncall == 0 || (VN_CACHED(vp) == 0));
+	} else {
+		/*
+		 * In the non-invalidating case, calls to fsync() do not
+		 * flush all the dirty mmap'd pages.  That requires a
+		 * call to msync().
+		 */
+		VOP_FLUSH_PAGES(vp, start, -1,
+				(flag & FSYNC_WAIT) ? 0 : XFS_B_ASYNC,
+				FI_NONE, error2);
+	}
+
+	if (error2) {
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return XFS_ERROR(error2);
+	}
+
+	/*
+	 * We always need to make sure that the required inode state
+	 * is safe on disk.  The vnode might be clean but because
+	 * of committed transactions that haven't hit the disk yet.
+	 * Likewise, there could be unflushed non-transactional
+	 * changes to the inode core that have to go to disk.
+	 *
+	 * The following code depends on one assumption:  that
+	 * any transaction that changes an inode logs the core
+	 * because it has to change some field in the inode core
+	 * (typically nextents or nblocks).  That assumption
+	 * implies that any transactions against an inode will
+	 * catch any non-transactional updates.	 If inode-altering
+	 * transactions exist that violate this assumption, the
+	 * code breaks.	 Right now, it figures that if the involved
+	 * update_* field is clear and the inode is unpinned, the
+	 * inode is clean.  Either it's been flushed or it's been
+	 * committed and the commit has hit the disk unpinning the inode.
+	 * (Note that xfs_inode_item_format() called at commit clears
+	 * the update_* fields.)
+	 */
+	if (!(flag & FSYNC_DATA)) {
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+		if (ip->i_update_core == 0)  {
+			/*
+			 * Timestamps/size haven't changed since last inode
+			 * flush or inode transaction commit.  That means
+			 * either nothing got written or a transaction
+			 * committed which caught the updates.	If the
+			 * latter happened and the transaction hasn't
+			 * hit the disk yet, the inode will be still
+			 * be pinned.  If it is, force the log.
+			 */
+			if (xfs_ipincount(ip) == 0)  {
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL |
+						XFS_ILOCK_SHARED);
+			} else	{
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL |
+						XFS_ILOCK_SHARED);
+				xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE |
+					      ((flag & FSYNC_WAIT)
+					       ? XFS_LOG_SYNC : 0));
+			}
+			error = 0;
+		} else	{
+			/*
+			 * Kick off a transaction to log the inode
+			 * core to get the updates.  Make it
+			 * sync if FSYNC_WAIT is passed in (which
+			 * is done by everybody but specfs).  The
+			 * sync transaction will also force the log.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+			if ((error = xfs_trans_reserve(tp, 0,
+					XFS_FSYNC_TS_LOG_RES(ip->i_mount),
+					0, 0, 0)))  {
+				xfs_trans_cancel(tp, 0);
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+				return error;
+			}
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+			/*
+			 * Note - it's possible that we might have pushed
+			 * ourselves out of the way during trans_reserve
+			 * which would flush the inode.	 But there's no
+			 * guarantee that the inode buffer has actually
+			 * gone out yet (it's delwri).	Plus the buffer
+			 * could be pinned anyway if it's part of an
+			 * inode in another recent transaction.	 So we
+			 * play it safe and fire off the transaction anyway.
+			 */
+			xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+			xfs_trans_ihold(tp, ip);
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+			if (flag & FSYNC_WAIT)
+				xfs_trans_set_sync(tp);
+			error = xfs_trans_commit(tp, 0, NULL);
+
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+		}
+	} else {
+		/*
+		 * We don't care about the timestamps here.  We
+		 * only care about the size field growing on us
+		 * and forcing any space allocation transactions.
+		 * We have to flush changes to the size fields
+		 * otherwise we could write out data that
+		 * becomes inaccessible after a crash.
+		 */
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+		if (ip->i_update_size == 0)  {
+			/*
+			 * Force the log if the inode is pinned.
+			 * That ensures that all transactions committed
+			 * against the inode hit the disk.  This may do
+			 * too much work but it's safe.
+			 */
+			if (xfs_ipincount(ip) == 0)  {
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL |
+						XFS_ILOCK_SHARED);
+			} else	{
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL |
+						XFS_ILOCK_SHARED);
+				xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+					      XFS_LOG_FORCE |
+					      ((flag & FSYNC_WAIT)
+					       ? XFS_LOG_SYNC : 0));
+			}
+			error = 0;
+		} else	{
+			/*
+			 * Kick off a sync transaction to log the inode
+			 * core.  The transaction has to be sync since
+			 * we need these updates to guarantee that the
+			 * data written will be seen.  The sync
+			 * transaction will also force the log.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+			tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
+			if ((error = xfs_trans_reserve(tp, 0,
+					XFS_FSYNC_TS_LOG_RES(ip->i_mount),
+					0, 0, 0)))  {
+				xfs_trans_cancel(tp, 0);
+				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+				return error;
+			}
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+			/*
+			 * Note - it's possible that we might have pushed
+			 * ourselves out of the way during trans_reserve
+			 * which would flush the inode.	 But there's no
+			 * guarantee that the inode buffer has actually
+			 * gone out yet (it's delwri).	Plus the buffer
+			 * could be pinned anyway if it's part of an
+			 * inode in another recent transaction.	 So we
+			 * play it safe and fire off the transaction anyway.
+			 */
+			xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+			xfs_trans_ihold(tp, ip);
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+			if (flag & FSYNC_WAIT)
+				xfs_trans_set_sync(tp);
+			error = xfs_trans_commit(tp, 0, NULL);
+
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL);
+		}
+	}
+	return error;
+}
+
+
+#if 0
+/*
+ * This is a utility routine for xfs_inactive.	It is called when a
+ * transaction attempting to free up the disk space for a file encounters
+ * an error.  It cancels the old transaction and starts up a new one
+ * to be used to free up the inode.  It also sets the inode size and extent
+ * counts to 0 and frees up any memory being used to store inline data,
+ * extents, or btree roots.
+ */
+STATIC void
+xfs_itruncate_cleanup(
+	xfs_trans_t	**tpp,
+	xfs_inode_t	*ip,
+	int		commit_flags,
+	int		fork)
+{
+	xfs_mount_t	*mp;
+	/* REFERENCED */
+	int		error;
+
+	mp = ip->i_mount;
+	if (*tpp) {
+		xfs_trans_cancel(*tpp, commit_flags | XFS_TRANS_ABORT);
+	}
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	*tpp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(*tpp, 0, XFS_IFREE_LOG_RES(mp), 0, 0,
+				  XFS_DEFAULT_LOG_COUNT);
+	if (error) {
+		return;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ijoin(*tpp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ihold(*tpp, ip);
+
+	xfs_idestroy_fork(ip, fork);
+
+	if (fork == XFS_DATA_FORK) {
+		ip->i_d.di_nblocks = 0;
+		ip->i_d.di_nextents = 0;
+		ip->i_d.di_size = 0;
+	} else {
+		ip->i_d.di_anextents = 0;
+	}
+	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+}
+#endif
+
+/*
+ * This is called by xfs_inactive to free any blocks beyond eof,
+ * when the link count isn't zero.
+ */
+STATIC int
+xfs_inactive_free_eofblocks(
+	xfs_mount_t	*mp,
+	xfs_inode_t	*ip)
+{
+	xfs_trans_t	*tp;
+	int		error;
+	xfs_fileoff_t	end_fsb;
+	xfs_fileoff_t	last_fsb;
+	xfs_filblks_t	map_len;
+	int		nimaps;
+	xfs_bmbt_irec_t imap;
+
+	/*
+	 * Figure out if there are any blocks beyond the end
+	 * of the file.	 If not, then there is nothing to do.
+	 */
+	end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
+	last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAX_FILE_OFFSET);
+	map_len = last_fsb - end_fsb;
+	if (map_len <= 0)
+		return (0);
+
+	nimaps = 1;
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
+			  NULL, 0, &imap, &nimaps, NULL);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (!error && (nimaps != 0) &&
+	    (imap.br_startblock != HOLESTARTBLOCK)) {
+		/*
+		 * Attach the dquots to the inode up front.
+		 */
+		if (XFS_IS_QUOTA_ON(mp) &&
+		    ip->i_ino != mp->m_sb.sb_uquotino &&
+		    ip->i_ino != mp->m_sb.sb_gquotino) {
+			if (XFS_NOT_DQATTACHED(mp, ip)) {
+				if ((error = xfs_qm_dqattach(ip, 0)))
+					return (error);
+			}
+		}
+
+		/*
+		 * There are blocks after the end of file.
+		 * Free them up now by truncating the file to
+		 * its current size.
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+		/*
+		 * Do the xfs_itruncate_start() call before
+		 * reserving any log space because
+		 * itruncate_start will call into the buffer
+		 * cache and we can't
+		 * do that within a transaction.
+		 */
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
+				    ip->i_d.di_size);
+
+		error = xfs_trans_reserve(tp, 0,
+					  XFS_ITRUNCATE_LOG_RES(mp),
+					  0, XFS_TRANS_PERM_LOG_RES,
+					  XFS_ITRUNCATE_LOG_COUNT);
+		if (error) {
+			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return (error);
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip,
+				XFS_IOLOCK_EXCL |
+				XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+
+		error = xfs_itruncate_finish(&tp, ip,
+					     ip->i_d.di_size,
+					     XFS_DATA_FORK,
+					     0);
+		/*
+		 * If we get an error at this point we
+		 * simply don't bother truncating the file.
+		 */
+		if (error) {
+			xfs_trans_cancel(tp,
+					 (XFS_TRANS_RELEASE_LOG_RES |
+					  XFS_TRANS_ABORT));
+		} else {
+			error = xfs_trans_commit(tp,
+						XFS_TRANS_RELEASE_LOG_RES,
+						NULL);
+		}
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	}
+	return (error);
+}
+
+/*
+ * Free a symlink that has blocks associated with it.
+ */
+STATIC int
+xfs_inactive_symlink_rmt(
+	xfs_inode_t	*ip,
+	xfs_trans_t	**tpp)
+{
+	xfs_buf_t	*bp;
+	int		committed;
+	int		done;
+	int		error;
+	xfs_fsblock_t	first_block;
+	xfs_bmap_free_t free_list;
+	int		i;
+	xfs_mount_t	*mp;
+	xfs_bmbt_irec_t mval[SYMLINK_MAPS];
+	int		nmaps;
+	xfs_trans_t	*ntp;
+	int		size;
+	xfs_trans_t	*tp;
+
+	tp = *tpp;
+	mp = ip->i_mount;
+	ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
+	/*
+	 * We're freeing a symlink that has some
+	 * blocks allocated to it.  Free the
+	 * blocks here.	 We know that we've got
+	 * either 1 or 2 extents and that we can
+	 * free them all in one bunmapi call.
+	 */
+	ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_trans_cancel(tp, 0);
+		*tpp = NULL;
+		return error;
+	}
+	/*
+	 * Lock the inode, fix the size, and join it to the transaction.
+	 * Hold it so in the normal path, we still have it locked for
+	 * the second transaction.  In the error paths we need it
+	 * held so the cancel won't rele it, see below.
+	 */
+	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	size = (int)ip->i_d.di_size;
+	ip->i_d.di_size = 0;
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	/*
+	 * Find the block(s) so we can inval and unmap them.
+	 */
+	done = 0;
+	XFS_BMAP_INIT(&free_list, &first_block);
+	nmaps = sizeof(mval) / sizeof(mval[0]);
+	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
+			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
+			&free_list)))
+		goto error0;
+	/*
+	 * Invalidate the block(s).
+	 */
+	for (i = 0; i < nmaps; i++) {
+		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+			XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
+			XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
+		xfs_trans_binval(tp, bp);
+	}
+	/*
+	 * Unmap the dead block(s) to the free_list.
+	 */
+	if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+			&first_block, &free_list, &done)))
+		goto error1;
+	ASSERT(done);
+	/*
+	 * Commit the first transaction.  This logs the EFI and the inode.
+	 */
+	if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
+		goto error1;
+	/*
+	 * The transaction must have been committed, since there were
+	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
+	 * The new tp has the extent freeing and EFDs.
+	 */
+	ASSERT(committed);
+	/*
+	 * The first xact was committed, so add the inode to the new one.
+	 * Mark it dirty so it will be logged and moved forward in the log as
+	 * part of every commit.
+	 */
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	/*
+	 * Get a new, empty transaction to return to our caller.
+	 */
+	ntp = xfs_trans_dup(tp);
+	/*
+	 * Commit the transaction containing extent freeing and EFD's.
+	 * If we get an error on the commit here or on the reserve below,
+	 * we need to unlock the inode since the new transaction doesn't
+	 * have the inode attached.
+	 */
+	error = xfs_trans_commit(tp, 0, NULL);
+	tp = ntp;
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		goto error0;
+	}
+	/*
+	 * Remove the memory for extent descriptions (just bookkeeping).
+	 */
+	if (ip->i_df.if_bytes)
+		xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
+	ASSERT(ip->i_df.if_bytes == 0);
+	/*
+	 * Put an itruncate log reservation in the new transaction
+	 * for our caller.
+	 */
+	if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		goto error0;
+	}
+	/*
+	 * Return with the inode locked but not joined to the transaction.
+	 */
+	*tpp = tp;
+	return 0;
+
+ error1:
+	xfs_bmap_cancel(&free_list);
+ error0:
+	/*
+	 * Have to come here with the inode locked and either
+	 * (held and in the transaction) or (not in the transaction).
+	 * If the inode isn't held then cancel would iput it, but
+	 * that's wrong since this is inactive and the vnode ref
+	 * count is 0 already.
+	 * Cancel won't do anything to the inode if held, but it still
+	 * needs to be locked until the cancel is done, if it was
+	 * joined to the transaction.
+	 */
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	*tpp = NULL;
+	return error;
+
+}
+
+STATIC int
+xfs_inactive_symlink_local(
+	xfs_inode_t	*ip,
+	xfs_trans_t	**tpp)
+{
+	int		error;
+	ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
+	/*
+	 * We're freeing a symlink which fit into
+	 * the inode.  Just free the memory used
+	 * to hold the old symlink.
+	 */
+	error = xfs_trans_reserve(*tpp, 0,
+				  XFS_ITRUNCATE_LOG_RES(ip->i_mount),
+				  0, XFS_TRANS_PERM_LOG_RES,
+				  XFS_ITRUNCATE_LOG_COUNT);
+
+	if (error) {
+		xfs_trans_cancel(*tpp, 0);
+		*tpp = NULL;
+		return (error);
+	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	/*
+	 * Zero length symlinks _can_ exist.
+	 */
+	if (ip->i_df.if_bytes > 0) {
+		xfs_idata_realloc(ip,
+				  -(ip->i_df.if_bytes),
+				  XFS_DATA_FORK);
+		ASSERT(ip->i_df.if_bytes == 0);
+	}
+	return (0);
+}
+
+/*
+ *
+ */
+STATIC int
+xfs_inactive_attrs(
+	xfs_inode_t	*ip,
+	xfs_trans_t	**tpp,
+	int		*commitflags)
+{
+	xfs_trans_t	*tp;
+	int		error;
+	xfs_mount_t	*mp;
+
+	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
+	tp = *tpp;
+	mp = ip->i_mount;
+	ASSERT(ip->i_d.di_forkoff != 0);
+	xfs_trans_commit(tp, *commitflags, NULL);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	*commitflags = 0;
+
+	error = xfs_attr_inactive(ip);
+	if (error) {
+		*tpp = NULL;
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return (error); /* goto out*/
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	error = xfs_trans_reserve(tp, 0,
+				  XFS_IFREE_LOG_RES(mp),
+				  0, 0,
+				  XFS_DEFAULT_LOG_COUNT);
+	if (error) {
+		ASSERT(XFS_FORCED_SHUTDOWN(mp));
+		xfs_trans_cancel(tp, 0);
+		*tpp = NULL;
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		return (error);
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+	xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+	ASSERT(ip->i_d.di_anextents == 0);
+
+	*tpp = tp;
+	return (0);
+}
+
+/*ARGSUSED*/
+STATIC int
+xfs_release(
+	bhv_desc_t	*bdp)
+{
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+	xfs_mount_t	*mp;
+	int		error;
+
+	vp = BHV_TO_VNODE(bdp);
+	ip = XFS_BHVTOI(bdp);
+
+	if ((vp->v_type != VREG) || (ip->i_d.di_mode == 0)) {
+		return 0;
+	}
+
+	/* If we are in the NFS reference cache then don't do this now */
+	if (ip->i_refcache)
+		return 0;
+
+	mp = ip->i_mount;
+
+	if (ip->i_d.di_nlink != 0) {
+		if ((((ip->i_d.di_mode & IFMT) == IFREG) &&
+		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
+		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
+		    (!(ip->i_d.di_flags & XFS_DIFLAG_PREALLOC))) {
+			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
+				return (error);
+			/* Update linux inode block count after free above */
+			LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
+				ip->i_d.di_nblocks + ip->i_delayed_blks);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * xfs_inactive
+ *
+ * This is called when the vnode reference count for the vnode
+ * goes to zero.  If the file has been unlinked, then it must
+ * now be truncated.  Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_inactive(
+	bhv_desc_t	*bdp,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+			/* REFERENCED */
+	vnode_t		*vp;
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	int		error;
+	int		commit_flags;
+	int		truncate;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_inactive", (inst_t *)__return_address);
+
+	ip = XFS_BHVTOI(bdp);
+
+	/*
+	 * If the inode is already free, then there can be nothing
+	 * to clean up here.
+	 */
+	if (ip->i_d.di_mode == 0) {
+		ASSERT(ip->i_df.if_real_bytes == 0);
+		ASSERT(ip->i_df.if_broot_bytes == 0);
+		return VN_INACTIVE_CACHE;
+	}
+
+	/*
+	 * Only do a truncate if it's a regular file with
+	 * some actual space in it.  It's OK to look at the
+	 * inode's fields without the lock because we're the
+	 * only one with a reference to the inode.
+	 */
+	truncate = ((ip->i_d.di_nlink == 0) &&
+	    ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) &&
+	    ((ip->i_d.di_mode & IFMT) == IFREG));
+
+	mp = ip->i_mount;
+
+	if (ip->i_d.di_nlink == 0 &&
+	    DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
+		(void) dm_send_destroy_event(bdp, DM_RIGHT_NULL);
+	}
+
+	error = 0;
+	if (ip->i_d.di_nlink != 0) {
+		if ((((ip->i_d.di_mode & IFMT) == IFREG) &&
+		     ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
+		     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
+		    (!(ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) ||
+		     (ip->i_delayed_blks != 0))) {
+			if ((error = xfs_inactive_free_eofblocks(mp, ip)))
+				return (VN_INACTIVE_CACHE);
+			/* Update linux inode block count after free above */
+			LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
+				ip->i_d.di_nblocks + ip->i_delayed_blks);
+		}
+		goto out;
+	}
+
+	ASSERT(ip->i_d.di_nlink == 0);
+
+	if (XFS_IS_QUOTA_ON(mp) &&
+	    ip->i_ino != mp->m_sb.sb_uquotino &&
+	    ip->i_ino != mp->m_sb.sb_gquotino) {
+		if (XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((error = xfs_qm_dqattach(ip, 0)))
+				return (VN_INACTIVE_CACHE);
+		}
+	}
+	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+	if (truncate) {
+		/*
+		 * Do the xfs_itruncate_start() call before
+		 * reserving any log space because itruncate_start
+		 * will call into the buffer cache and we can't
+		 * do that within a transaction.
+		 */
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+		xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
+
+		error = xfs_trans_reserve(tp, 0,
+					  XFS_ITRUNCATE_LOG_RES(mp),
+					  0, XFS_TRANS_PERM_LOG_RES,
+					  XFS_ITRUNCATE_LOG_COUNT);
+		if (error) {
+			/* Don't call itruncate_cleanup */
+			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			return (VN_INACTIVE_CACHE);
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+
+		/*
+		 * normally, we have to run xfs_itruncate_finish sync.
+		 * But if filesystem is wsync and we're in the inactive
+		 * path, then we know that nlink == 0, and that the
+		 * xaction that made nlink == 0 is permanently committed
+		 * since xfs_remove runs as a synchronous transaction.
+		 */
+		error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
+				(!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
+		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+		if (error) {
+			xfs_trans_cancel(tp, commit_flags | XFS_TRANS_ABORT);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+			return (VN_INACTIVE_CACHE);
+		}
+	} else if ((ip->i_d.di_mode & IFMT) == IFLNK) {
+
+		/*
+		 * If we get an error while cleaning up a
+		 * symlink we bail out.
+		 */
+		error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
+			xfs_inactive_symlink_rmt(ip, &tp) :
+			xfs_inactive_symlink_local(ip, &tp);
+
+		if (error) {
+			ASSERT(tp == NULL);
+			return (VN_INACTIVE_CACHE);
+		}
+
+		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+		commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+
+	} else {
+		error = xfs_trans_reserve(tp, 0,
+					  XFS_IFREE_LOG_RES(mp),
+					  0, 0,
+					  XFS_DEFAULT_LOG_COUNT);
+		if (error) {
+			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			return (VN_INACTIVE_CACHE);
+		}
+
+		xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+		xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+		commit_flags = 0;
+	}
+
+	/*
+	 * If there are attributes associated with the file
+	 * then blow them away now.  The code calls a routine
+	 * that recursively deconstructs the attribute fork.
+	 * We need to just commit the current transaction
+	 * because we can't use it for xfs_attr_inactive().
+	 */
+	if (ip->i_d.di_anextents > 0) {
+		error = xfs_inactive_attrs(ip, &tp, &commit_flags);
+		/*
+		 * If we got an error, the transaction is already
+		 * cancelled, and the inode is unlocked. Just get out.
+		 */
+		 if (error)
+			 return (VN_INACTIVE_CACHE);
+	} else if (ip->i_afp) {
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+	}
+
+	/*
+	 * Free the inode.
+	 */
+	error = xfs_ifree(tp, ip);
+	if (error) {
+		/*
+		 * If we fail to free the inode, shut down.  The cancel
+		 * might do that, we need to make sure.	 Otherwise the
+		 * inode might be lost for a long time or forever.
+		 */
+		if (!XFS_FORCED_SHUTDOWN(tp->t_mountp)) {
+			cmn_err(CE_NOTE,
+				"xfs_inactive:	xfs_ifree() returned an error = %d on %s",
+				error,tp->t_mountp->m_fsname);
+			xfs_force_shutdown(tp->t_mountp, XFS_METADATA_IO_ERROR);
+		}
+		xfs_trans_cancel(tp, commit_flags | XFS_TRANS_ABORT);
+	} else {
+		/*
+		 * Credit the quota account(s). The inode is gone.
+		 */
+		if (XFS_IS_QUOTA_ON(tp->t_mountp))
+			xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT,
+							 -1);
+
+		/*
+		 * Just ignore errors at this point.  There is
+		 * nothing we can do except to try to keep going.
+		 */
+		(void) xfs_trans_commit(tp, commit_flags, NULL);
+	}
+	/*
+	 * Release the dquots held by inode, if any.
+	 */
+	if (ip->i_udquot || ip->i_gdquot)
+		xfs_qm_dqdettach_inode(ip);
+
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+
+ out:
+	return VN_INACTIVE_CACHE;
+}
+
+
+/*
+ * xfs_lookup
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_lookup(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	vnode_t		**vpp,
+	int		flags,
+	vnode_t		*rdir,
+	cred_t		*credp)
+{
+	xfs_inode_t		*dp, *ip;
+	struct vnode		*vp;
+	xfs_ino_t		e_inum;
+	int			error;
+	uint			lock_mode;
+	uint			lookup_flags;
+	vnode_t			*dir_vp;
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+
+	vn_trace_entry(dir_vp, "xfs_lookup", (inst_t *)__return_address);
+
+	dp = XFS_BHVTOI(dir_bdp);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+		return XFS_ERROR(EIO);
+
+	lock_mode = xfs_ilock_map_shared(dp);
+
+	lookup_flags = DLF_IGET;
+	if (lock_mode == XFS_ILOCK_SHARED) {
+		lookup_flags |= DLF_LOCK_SHARED;
+	}
+	error = xfs_dir_lookup_int(dir_bdp, lookup_flags, dentry, &e_inum, &ip);
+	if (error) {
+		xfs_iunlock_map_shared(dp, lock_mode);
+		return error;
+	}
+
+	vp = XFS_ITOV(ip);
+
+	ITRACE(ip);
+
+	xfs_iunlock_map_shared(dp, lock_mode);
+
+	*vpp = vp;
+
+	return 0;
+}
+
+#ifdef XFS_RW_TRACE
+STATIC void
+xfs_ctrunc_trace(
+	int		tag,
+	xfs_inode_t	*ip)
+{
+	if (ip->i_rwtrace == NULL) {
+		return;
+	}
+
+	ktrace_enter(ip->i_rwtrace,
+		     (void*)((long)tag),
+		     (void*)ip,
+		     (void*)((long)private.p_cpuid),
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0,
+		     (void*)0);
+}
+#endif /* XFS_RW_TRACE */
+
+#define XFS_CREATE_NEW_MAXTRIES 10000
+
+/*
+ * xfs_create (create a new file).
+ *   It might still find name exists out there, though.
+ *	But vpp, doens't point at a vnode.
+ */
+STATIC int
+xfs_create(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	vattr_t		*vap,
+	vnode_t		**vpp,
+	cred_t		*credp)
+{
+	char			*name = (char *)dentry->d_name.name;
+	vnode_t			*dir_vp;
+	xfs_inode_t		*dp, *ip;
+	vnode_t			*vp=NULL;
+	xfs_trans_t		*tp;
+	xfs_ino_t		e_inum;
+	xfs_mount_t		*mp;
+	dev_t			rdev;
+	int			error;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	boolean_t		dp_joined_to_trans;
+	boolean_t		truncated;
+	boolean_t		created = B_FALSE;
+	boolean_t		inode_change = B_FALSE;
+	int			dm_event_sent = 0;
+	uint			cancel_flags;
+	int			committed;
+	xfs_prid_t		prid;
+	struct xfs_dquot	*udqp, *gdqp;
+	uint			resblks;
+	int			dm_di_mode;
+	int			xfs_create_retries = 0;
+	xfs_ino_t		e_inum_saved;	/* for retry trap code */
+	int			namelen;
+
+	ASSERT(!*vpp);
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	dp = XFS_BHVTOI(dir_bdp);
+
+	vn_trace_entry(dir_vp, "xfs_create", (inst_t *)__return_address);
+
+	dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
+	namelen = dentry->d_name.len;
+	if (namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
+		error = xfs_dm_send_create_event(dir_bdp, name,
+				dm_di_mode, &dm_event_sent);
+		if (error)
+			return error;
+	}
+
+	/* Return through std_return after this point. */
+
+	mp = dp->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	udqp = gdqp = NULL;
+	if (vap->va_mask & AT_PROJID)
+		prid = (xfs_prid_t)vap->va_projid;
+	else
+		prid = (xfs_prid_t)dfltprid;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		error = xfs_qm_vop_dqalloc(mp, dp,
+					   current->fsuid, current->fsgid,
+					   XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT,
+					   &udqp, &gdqp);
+		if (error)
+			goto std_return;
+	}
+
+ try_again:
+	ip = NULL;
+	dp_joined_to_trans = B_FALSE;
+	truncated = B_FALSE;
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	resblks = XFS_CREATE_SPACE_RES(mp, namelen);
+	/*
+	 * Initially assume that the file does not exist and
+	 * reserve the resources for that case.	 If that is not
+	 * the case we'll drop the one we have and get a more
+	 * appropriate transaction later.
+	 */
+	error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		dp = NULL;
+		goto error_return;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * At this point we cannot do an xfs_iget() of the entry named
+	 * since we are already holding a log reservation and it could
+	 * be in xfs_inactive waiting for a log reservation.  We'd just
+	 * end up waiting forever for the inactive to complete.	 Instead
+	 * we just look to see if there is an entry with the name.  In
+	 * the case where there is not, we didn't need the inode anyway.
+	 * In the case where there is an entry, we'll get it later after
+	 * dropping our transaction.
+	 */
+	error = xfs_dir_lookup_int(dir_bdp, 0, dentry, &e_inum, NULL);
+	if (error && (error != ENOENT)) {
+		goto error_return;
+	}
+
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	if (error == ENOENT) {
+
+		ASSERT(ip == NULL);
+
+		/*
+		 * Reserve disk quota and the inode.
+		 */
+		if (XFS_IS_QUOTA_ON(mp)) {
+			if (xfs_trans_reserve_quota(tp, udqp, gdqp, resblks,
+						    1, 0)) {
+				error = EDQUOT;
+				goto error_return;
+			}
+		}
+		if (resblks == 0 &&
+		    (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
+			goto error_return;
+		rdev = (vap->va_mask & AT_RDEV) ? vap->va_rdev : 0;
+		error = xfs_dir_ialloc(&tp, dp,
+				MAKEIMODE(vap->va_type,vap->va_mode), 1,
+				rdev, credp, prid, resblks > 0,
+				&ip, &committed);
+		if (error) {
+			if (error == ENOSPC)
+				goto error_return;
+			goto abort_return;
+		}
+		ITRACE(ip);
+
+		/*
+		 * At this point, we've gotten a newly allocated inode.
+		 * It is locked (and joined to the transaction).
+		 */
+
+		ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
+
+		/*
+		 * Now we join the directory inode to the transaction.
+		 * We do not do it earlier because xfs_dir_ialloc
+		 * might commit the previous transaction (and release
+		 * all the locks).
+		 */
+
+		VN_HOLD(dir_vp);
+		xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+		dp_joined_to_trans = B_TRUE;
+
+		error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
+			&first_block, &free_list,
+			resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+		if (error) {
+			ASSERT(error != ENOSPC);
+			goto abort_return;
+		}
+		xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+		/*
+		 * If this is a synchronous mount, make sure that the
+		 * create transaction goes to disk before returning to
+		 * the user.
+		 */
+		if (mp->m_flags & XFS_MOUNT_WSYNC) {
+			xfs_trans_set_sync(tp);
+		}
+
+		dp->i_gen++;
+
+		/*
+		 * Attach the dquot(s) to the inodes and modify them incore.
+		 * These ids of the inode couldn't have changed since the new
+		 * inode has been locked ever since it was created.
+		 */
+		if (XFS_IS_QUOTA_ON(mp))
+			xfs_qm_vop_dqattach_and_dqmod_newinode(tp, ip, udqp,
+							       gdqp);
+		created = B_TRUE;
+	} else {
+		e_inum_saved = e_inum;
+
+		/*
+		 * The file already exists, so we're in the wrong
+		 * transaction for this operation.  Cancel the old
+		 * transaction and start a new one.  We have to drop
+		 * our locks in doing this. But, we don't care
+		 * if the directory changes. We have already checked
+		 * if the dir exists and is EXEC by the user.
+		 * After all, we already have the vnode held.
+		 *
+		 * All we need to do is truncate it.
+		 */
+		xfs_trans_cancel(tp, cancel_flags);
+		tp = NULL;
+
+		/*
+		 * Now that we've dropped our log reservation, get a
+		 * reference to the inode we are re-creating.  If we
+		 * have to drop the directory lock in the call and
+		 * the entry is removed, then just start over.
+		 */
+		error = xfs_dir_lookup_int(dir_bdp, DLF_IGET, dentry,
+					   &e_inum, &ip);
+		if (error) {
+			if (error == ENOENT) {
+				if (++xfs_create_retries >
+					XFS_CREATE_NEW_MAXTRIES) {
+					error = XFS_ERROR(EFSCORRUPTED);
+					goto error_return;
+				}
+
+				xfs_iunlock(dp, XFS_ILOCK_EXCL);
+				goto try_again;
+			}
+			goto error_return;
+		}
+		ITRACE(ip);
+
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+		dp = NULL;
+
+		/*
+		 * Done with directory. Don't care about it
+		 * anymore.
+		 */
+
+		/*
+		 * Since we're at a good, clean point, check for any
+		 * obvious problems and get out if they occur.
+		 */
+		vp = XFS_ITOV(ip);
+		if (!error) {
+			if (vp->v_type == VREG &&
+				(vap->va_mask & AT_SIZE) &&
+				DM_EVENT_ENABLED (vp->v_vfsp, ip,
+					DM_EVENT_TRUNCATE)) {
+					error = xfs_dm_send_data_event(
+						DM_EVENT_TRUNCATE,
+						XFS_ITOBHV(ip),
+						vap->va_size, 0, 0, NULL);
+			}
+		}
+
+		if (!error && XFS_IS_QUOTA_ON(mp)) {
+			if (XFS_NOT_DQATTACHED(mp, ip))
+				error = xfs_qm_dqattach(ip, 0);
+		}
+
+		if (error) {
+			IRELE(ip);
+			goto error_return;
+		}
+
+		/* Need the behaviour lock before the iolock as we are
+		 * potentially going to make a VOP call on vp. */
+		VN_BHV_READ_LOCK(&(vp)->v_bh);
+
+		/*
+		 * We need to do the xfs_itruncate_start call before
+		 * reserving any log space in the transaction.
+		 */
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		xfs_ctrunc_trace(XFS_CTRUNC1, ip);
+		if ((vp->v_type == VREG) &&
+		    (vap->va_mask & AT_SIZE) &&
+		    ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents != 0))) {
+			xfs_itruncate_start(ip, XFS_ITRUNC_MAYBE, 0);
+		}
+
+		/* Once we got the I/O lock we can drop the behaviour
+		 * lock.
+		 */
+		VN_BHV_READ_UNLOCK(&(vp)->v_bh);
+
+		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TRUNC);
+		if ((error = xfs_trans_reserve(tp, 0,
+					      XFS_ITRUNCATE_LOG_RES(mp), 0,
+					      XFS_TRANS_PERM_LOG_RES,
+					      XFS_ITRUNCATE_LOG_COUNT))) {
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			IRELE(ip);
+			cancel_flags = 0;
+			xfs_ctrunc_trace(XFS_CTRUNC2, ip);
+			goto error_return;
+		}
+
+		/*
+		 * Now lock inode.
+		 */
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+		if (error) {
+			xfs_ctrunc_trace(XFS_CTRUNC4, ip);
+			goto error_return;
+		}
+
+		if (vp->v_type == VREG && (vap->va_mask & AT_SIZE)) {
+			/*
+			 * Truncate the file.  The timestamps must
+			 * be updated whether the file is changed
+			 * or not.
+			 */
+			ASSERT(vap->va_size == 0);
+			if ((ip->i_d.di_size > 0) || (ip->i_d.di_nextents)) {
+				xfs_ctrunc_trace(XFS_CTRUNC5, ip);
+				xfs_trans_ihold(tp, ip);
+				/*
+				 * always signal sync xaction.	We're
+				 * truncating an existing file so the
+				 * xaction must be sync regardless
+				 * of whether the filesystem is wsync
+				 * to make the truncate persistent
+				 * in the face of a crash.
+				 */
+				error = xfs_itruncate_finish(&tp, ip,
+							     (xfs_fsize_t)0,
+							     XFS_DATA_FORK, 1);
+				if (error) {
+					ASSERT(ip->i_transp == tp);
+					xfs_trans_ihold_release(tp, ip);
+					goto abort_return;
+				}
+				truncated = B_TRUE;
+				xfs_ichgtime(ip,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+				xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+			} else {
+				xfs_ichgtime(ip,
+					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+			}
+			inode_change = B_TRUE;
+
+		}
+		xfs_ctrunc_trace(XFS_CTRUNC6, ip);
+	}
+
+
+	/*
+	 * xfs_trans_commit normally decrements the vnode ref count
+	 * when it unlocks the inode. Since we want to return the
+	 * vnode to the caller, we bump the vnode ref count now.
+	 */
+	IHOLD(ip);
+	vp = XFS_ITOV(ip);
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		xfs_bmap_cancel(&free_list);
+		if (truncated) {
+			/*
+			 * If we truncated the file, then the inode will
+			 * have been held within the previous transaction
+			 * and must be unlocked now.
+			 */
+			xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+			ASSERT(vn_count(vp) >= 2);
+			IRELE(ip);
+		}
+		goto abort_rele;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (truncated) {
+		/*
+		 * If we truncated the file, then the inode will
+		 * have been held within the transaction and must
+		 * be unlocked now.
+		 */
+		xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+		ASSERT(vn_count(vp) >= 2);
+		IRELE(ip);
+	}
+	if (error) {
+		IRELE(ip);
+		tp = NULL;
+		goto error_return;
+	}
+
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	/*
+	 * Propogate the fact that the vnode changed after the
+	 * xfs_inode locks have been released.
+	 */
+	if (inode_change == B_TRUE) {
+		VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 0);
+	} else {
+		VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
+	}
+
+	*vpp = vp;
+
+	/* Fallthrough to std_return with error = 0  */
+
+std_return:
+	if ( (created || (error != 0 && dm_event_sent != 0)) &&
+			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
+							DM_EVENT_POSTCREATE)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTCREATE,
+			dir_bdp, DM_RIGHT_NULL,
+			created ? vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops):NULL,
+			DM_RIGHT_NULL, name, NULL,
+			dm_di_mode, error, 0);
+	}
+	return error;
+
+ abort_return:
+	cancel_flags |= XFS_TRANS_ABORT;
+	/* FALLTHROUGH */
+ error_return:
+
+	if (tp != NULL)
+		xfs_trans_cancel(tp, cancel_flags);
+
+	if (!dp_joined_to_trans && (dp != NULL))
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	goto std_return;
+
+ abort_rele:
+	/*
+	 * Wait until after the current transaction is aborted to
+	 * release the inode.  This prevents recursive transactions
+	 * and deadlocks from xfs_inactive.
+	 */
+	cancel_flags |= XFS_TRANS_ABORT;
+	xfs_trans_cancel(tp, cancel_flags);
+	IRELE(ip);
+
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	goto std_return;
+}
+
+#ifdef DEBUG
+
+/*
+ * Some counters to see if (and how often) we are hitting some deadlock
+ * prevention code paths.
+ */
+
+int xfs_rm_locks;
+int xfs_rm_lock_delays;
+int xfs_rm_attempts;
+#endif
+
+/*
+ * The following routine will lock the inodes associated with the
+ * directory and the named entry in the directory. The locks are
+ * acquired in increasing inode number.
+ *
+ * If the entry is "..", then only the directory is locked. The
+ * vnode ref count will still include that from the .. entry in
+ * this case.
+ *
+ * The inode passed in will have been looked up using xfs_get_dir_entry().
+ * Since that lookup the directory lock will have been dropped, so
+ * we need to validate that the inode given is still pointed to by the
+ * directory.  We use the directory inode in memory generation count
+ * as an optimization to tell if a new lookup is necessary.  If the
+ * directory no longer points to the given inode with the given name,
+ * then we drop the directory lock, set the entry_changed parameter to 1,
+ * and return.	It is up to the caller to drop the reference to the inode.
+ *
+ * There is a dealock we need to worry about. If the locked directory is
+ * in the AIL, it might be blocking up the log. The next inode we lock
+ * could be already locked by another thread waiting for log space (e.g
+ * a permanent log reservation with a long running transaction (see
+ * xfs_itruncate_finish)). To solve this, we must check if the directory
+ * is in the ail and use lock_nowait. If we can't lock, we need to
+ * drop the inode lock on the directory and try again. xfs_iunlock will
+ * potentially push the tail if we were holding up the log.
+ */
+STATIC int
+xfs_lock_dir_and_entry(
+	xfs_inode_t	*dp,
+	struct dentry	*dentry,
+	xfs_inode_t	*ip,	/* inode of entry 'name' */
+	int		*entry_changed)
+{
+	int			attempts;
+	xfs_ino_t		e_inum;
+	xfs_inode_t		*ips[2];
+	xfs_log_item_t		*lp;
+
+#ifdef DEBUG
+	xfs_rm_locks++;
+#endif
+	attempts = 0;
+
+again:
+	*entry_changed = 0;
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	e_inum = ip->i_ino;
+
+	ITRACE(ip);
+
+	/*
+	 * We want to lock in increasing inum. Since we've already
+	 * acquired the lock on the directory, we may need to release
+	 * if if the inum of the entry turns out to be less.
+	 */
+	if (e_inum > dp->i_ino) {
+		/*
+		 * We are already in the right order, so just
+		 * lock on the inode of the entry.
+		 * We need to use nowait if dp is in the AIL.
+		 */
+
+		lp = (xfs_log_item_t *)dp->i_itemp;
+		if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+			if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+				attempts++;
+#ifdef DEBUG
+				xfs_rm_attempts++;
+#endif
+
+				/*
+				 * Unlock dp and try again.
+				 * xfs_iunlock will try to push the tail
+				 * if the inode is in the AIL.
+				 */
+
+				xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+				if ((attempts % 5) == 0) {
+					delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+					xfs_rm_lock_delays++;
+#endif
+				}
+				goto again;
+			}
+		} else {
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+		}
+	} else if (e_inum < dp->i_ino) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+
+		ips[0] = ip;
+		ips[1] = dp;
+		xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
+	}
+	/* else	 e_inum == dp->i_ino */
+	/*     This can happen if we're asked to lock /x/..
+	 *     the entry is "..", which is also the parent directory.
+	 */
+
+	return 0;
+}
+
+#ifdef DEBUG
+int xfs_locked_n;
+int xfs_small_retries;
+int xfs_middle_retries;
+int xfs_lots_retries;
+int xfs_lock_delays;
+#endif
+
+/*
+ * The following routine will lock n inodes in exclusive mode.
+ * We assume the caller calls us with the inodes in i_ino order.
+ *
+ * We need to detect deadlock where an inode that we lock
+ * is in the AIL and we start waiting for another inode that is locked
+ * by a thread in a long running transaction (such as truncate). This can
+ * result in deadlock since the long running trans might need to wait
+ * for the inode we just locked in order to push the tail and free space
+ * in the log.
+ */
+void
+xfs_lock_inodes (xfs_inode_t **ips,
+	int inodes,
+	int first_locked,
+	uint lock_mode)
+{
+	int attempts = 0, i, j, try_lock;
+	xfs_log_item_t	*lp;
+
+	ASSERT(ips && (inodes >= 2)); /* we need at least two */
+
+	if (first_locked) {
+		try_lock = 1;
+		i = 1;
+	} else {
+		try_lock = 0;
+		i = 0;
+	}
+
+again:
+	for (; i < inodes; i++) {
+		ASSERT(ips[i]);
+
+		if (i && (ips[i] == ips[i-1]))	/* Already locked */
+			continue;
+
+		/*
+		 * If try_lock is not set yet, make sure all locked inodes
+		 * are not in the AIL.
+		 * If any are, set try_lock to be used later.
+		 */
+
+		if (!try_lock) {
+			for (j = (i - 1); j >= 0 && !try_lock; j--) {
+				lp = (xfs_log_item_t *)ips[j]->i_itemp;
+				if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+					try_lock++;
+				}
+			}
+		}
+
+		/*
+		 * If any of the previous locks we have locked is in the AIL,
+		 * we must TRY to get the second and subsequent locks. If
+		 * we can't get any, we must release all we have
+		 * and try again.
+		 */
+
+		if (try_lock) {
+			/* try_lock must be 0 if i is 0. */
+			/*
+			 * try_lock means we have an inode locked
+			 * that is in the AIL.
+			 */
+			ASSERT(i != 0);
+			if (!xfs_ilock_nowait(ips[i], lock_mode)) {
+				attempts++;
+
+				/*
+				 * Unlock all previous guys and try again.
+				 * xfs_iunlock will try to push the tail
+				 * if the inode is in the AIL.
+				 */
+
+				for(j = i - 1; j >= 0; j--) {
+
+					/*
+					 * Check to see if we've already
+					 * unlocked this one.
+					 * Not the first one going back,
+					 * and the inode ptr is the same.
+					 */
+					if ((j != (i - 1)) && ips[j] ==
+								ips[j+1])
+						continue;
+
+					xfs_iunlock(ips[j], lock_mode);
+				}
+
+				if ((attempts % 5) == 0) {
+					delay(1); /* Don't just spin the CPU */
+#ifdef DEBUG
+					xfs_lock_delays++;
+#endif
+				}
+				i = 0;
+				try_lock = 0;
+				goto again;
+			}
+		} else {
+			xfs_ilock(ips[i], lock_mode);
+		}
+	}
+
+#ifdef DEBUG
+	if (attempts) {
+		if (attempts < 5) xfs_small_retries++;
+		else if (attempts < 100) xfs_middle_retries++;
+		else xfs_lots_retries++;
+	} else {
+		xfs_locked_n++;
+	}
+#endif
+}
+
+#ifdef	DEBUG
+#define REMOVE_DEBUG_TRACE(x)	{remove_which_error_return = (x);}
+int remove_which_error_return = 0;
+#else /* ! DEBUG */
+#define REMOVE_DEBUG_TRACE(x)
+#endif	/* ! DEBUG */
+
+/*
+ * xfs_remove
+ *
+ */
+STATIC int
+xfs_remove(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	cred_t		*credp)
+{
+	vnode_t			*dir_vp;
+	char			*name = (char *) dentry->d_name.name;
+	xfs_inode_t		*dp, *ip;
+	xfs_trans_t		*tp = NULL;
+	xfs_mount_t		*mp;
+	int			error = 0;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	int			cancel_flags;
+	int			committed;
+	int			entry_changed;
+	int			dm_di_mode = 0;
+	int			link_zero;
+	uint			resblks;
+	int			namelen;
+/*	bhv_desc_t		*bdp; */
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+
+	vn_trace_entry(dir_vp, "xfs_remove", (inst_t *)__return_address);
+
+	dp = XFS_BHVTOI(dir_bdp);
+	mp = dp->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	namelen = dentry->d_name.len;
+	if (namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
+		error = dm_send_namesp_event(DM_EVENT_REMOVE, dir_bdp, DM_RIGHT_NULL,
+					NULL, DM_RIGHT_NULL,
+					name, NULL, 0, 0, 0);
+		if (error)
+			return error;
+	}
+
+	/* From this point on, return through std_return */
+ retry:
+	ip = NULL;
+
+	/*
+	 * We need to get a reference to ip before we get our log
+	 * reservation. The reason for this is that we cannot call
+	 * xfs_iget for an inode for which we do not have a reference
+	 * once we've acquired a log reservation. This is because the
+	 * inode we are trying to get might be in xfs_inactive going
+	 * for a log reservation. Since we'll have to wait for the
+	 * inactive code to complete before returning from xfs_iget,
+	 * we need to make sure that we don't have log space reserved
+	 * when we call xfs_iget.  Instead we get an unlocked referece
+	 * to the inode before getting our log reservation.
+	 */
+	error = xfs_get_dir_entry(dentry, &ip);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto std_return;
+	}
+
+	dm_di_mode = ip->i_d.di_mode;
+
+	vn_trace_entry(XFS_ITOV(ip), "xfs_remove", (inst_t *)__return_address);
+
+	ITRACE(ip);
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		ASSERT(! error);
+		if (XFS_NOT_DQATTACHED(mp, dp))
+			error = xfs_qm_dqattach(dp, 0);
+		if (!error && dp != ip && XFS_NOT_DQATTACHED(mp, ip))
+			error = xfs_qm_dqattach(ip, 0);
+		if (error) {
+			REMOVE_DEBUG_TRACE(__LINE__);
+			IRELE(ip);
+			goto std_return;
+		}
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * We try to get the real space reservation first,
+	 * allowing for directory btree deletion(s) implying
+	 * possible bmap insert(s).  If we can't get the space
+	 * reservation then we use 0 instead, and avoid the bmap
+	 * btree insert(s) in the directory code by, if the bmap
+	 * insert tries to happen, instead trimming the LAST
+	 * block from the directory.
+	 */
+	resblks = XFS_REMOVE_SPACE_RES(mp);
+	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+	}
+	if (error) {
+		ASSERT(error != ENOSPC);
+		REMOVE_DEBUG_TRACE(__LINE__);
+		xfs_trans_cancel(tp, 0);
+		IRELE(ip);
+		return error;
+	}
+
+	error = xfs_lock_dir_and_entry(dp, dentry, ip, &entry_changed);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		xfs_trans_cancel(tp, cancel_flags);
+		IRELE(ip);
+		goto std_return;
+	}
+
+	/*
+	 * If the inode we found in the first pass is no longer
+	 * the entry with the given name, then drop our transaction and
+	 * inode reference and start over.
+	 */
+	if (entry_changed) {
+		xfs_trans_cancel(tp, cancel_flags);
+		IRELE(ip);
+		goto retry;
+	}
+
+	/*
+	 * At this point, we've gotten both the directory and the entry
+	 * inodes locked.
+	 */
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	if (dp != ip) {
+		/*
+		 * Increment vnode ref count only in this case since
+		 * there's an extra vnode reference in the case where
+		 * dp == ip.
+		 */
+		IHOLD(dp);
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	}
+
+	if ((error = _MAC_XFS_IACCESS(ip, MACWRITE, credp))) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error_return;
+	}
+
+	if ((ip->i_d.di_mode & IFMT) == IFDIR) {
+		error = XFS_ERROR(EPERM);
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error_return;
+	}
+
+	/*
+	 * Return error when removing . and ..
+	 */
+	if (name[0] == '.') {
+		if (name[1] == '\0') {
+			error = XFS_ERROR(EINVAL);
+			REMOVE_DEBUG_TRACE(__LINE__);
+			goto error_return;
+		}
+		else if (name[1] == '.' && name[2] == '\0') {
+			error = XFS_ERROR(EEXIST);
+			REMOVE_DEBUG_TRACE(__LINE__);
+			goto error_return;
+		}
+	}
+
+	/*
+	 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
+	 */
+	XFS_BMAP_INIT(&free_list, &first_block);
+	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
+		&first_block, &free_list, 0);
+	if (error) {
+		ASSERT(error != ENOENT);
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error1;
+	}
+	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	dp->i_gen++;
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	error = xfs_droplink(tp, ip);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error1;
+	}
+
+	/* Determine if this is the last link while
+	 * we are in the transaction.
+	 */
+	link_zero = (ip)->i_d.di_nlink==0;
+
+	/*
+	 * Take an extra ref on the inode so that it doesn't
+	 * go to xfs_inactive() from within the commit.
+	 */
+	IHOLD(ip);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * remove transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto error_rele;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (error) {
+		IRELE(ip);
+		goto std_return;
+	}
+
+	/*
+	 * Before we drop our extra reference to the inode, purge it
+	 * from the refcache if it is there.  By waiting until afterwards
+	 * to do the IRELE, we ensure that we won't go inactive in the
+	 * xfs_refcache_purge_ip routine (although that would be OK).
+	 */
+	xfs_refcache_purge_ip(ip);
+
+	vn_trace_exit(XFS_ITOV(ip), "xfs_remove",
+						(inst_t *)__return_address);
+
+	/*
+	 * Let interposed file systems know about removed links.
+	 */
+	VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
+
+	IRELE(ip);
+
+/*	Fall through to std_return with error = 0 */
+
+std_return:
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
+						DM_EVENT_POSTREMOVE)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTREMOVE,
+				dir_bdp, DM_RIGHT_NULL,
+				NULL, DM_RIGHT_NULL,
+				name, NULL, dm_di_mode, error, 0);
+	}
+	return error;
+
+ error1:
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+	goto std_return;
+
+ error_rele:
+	/*
+	 * In this case make sure to not release the inode until after
+	 * the current transaction is aborted.	Releasing it beforehand
+	 * can cause us to go to xfs_inactive and start a recursive
+	 * transaction which can easily deadlock with the current one.
+	 */
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+	xfs_trans_cancel(tp, cancel_flags);
+
+	/*
+	 * Before we drop our extra reference to the inode, purge it
+	 * from the refcache if it is there.  By waiting until afterwards
+	 * to do the IRELE, we ensure that we won't go inactive in the
+	 * xfs_refcache_purge_ip routine (although that would be OK).
+	 */
+	xfs_refcache_purge_ip(ip);
+
+	IRELE(ip);
+
+	goto std_return;
+}
+
+
+/*
+ * xfs_link
+ *
+ */
+STATIC int
+xfs_link(
+	bhv_desc_t	*target_dir_bdp,
+	vnode_t		*src_vp,
+	struct dentry	*dentry,
+	cred_t		*credp)
+{
+	xfs_inode_t		*tdp, *sip;
+	xfs_ino_t		e_inum;
+	xfs_trans_t		*tp;
+	xfs_mount_t		*mp;
+	xfs_inode_t		*ips[2];
+	int			error;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	int			cancel_flags;
+	int			committed;
+	vnode_t			*target_dir_vp;
+	bhv_desc_t		*src_bdp;
+	int			resblks;
+	char			*target_name = (char *)dentry->d_name.name;
+	int			target_namelen;
+
+	target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
+
+	vn_trace_entry(target_dir_vp, "xfs_link", (inst_t *)__return_address);
+
+	target_namelen = dentry->d_name.len;
+	if (target_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+
+	vn_trace_entry(src_vp, "xfs_link", (inst_t *)__return_address);
+
+	/*
+	 * For now, manually find the XFS behavior descriptor for
+	 * the source vnode.  If it doesn't exist then something
+	 * is wrong and we should just return an error.
+	 * Eventually we need to figure out how link is going to
+	 * work in the face of stacked vnodes.
+	 */
+	src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
+	if (src_bdp == NULL) {
+		return XFS_ERROR(EXDEV);
+	}
+	sip = XFS_BHVTOI(src_bdp);
+	tdp = XFS_BHVTOI(target_dir_bdp);
+	mp = tdp->i_mount;
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
+		error = dm_send_namesp_event(DM_EVENT_LINK,
+					target_dir_bdp, DM_RIGHT_NULL,
+					src_bdp, DM_RIGHT_NULL,
+					target_name, NULL, 0, 0, 0);
+		if (error)
+			return error;
+	}
+
+	/* Return through std_return after this point. */
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		error = 0;
+		if (XFS_NOT_DQATTACHED(mp, sip))
+			error = xfs_qm_dqattach(sip, 0);
+		if (!error && sip != tdp && XFS_NOT_DQATTACHED(mp, tdp))
+			error = xfs_qm_dqattach(tdp, 0);
+		if (error)
+			goto std_return;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
+	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		goto error_return;
+	}
+
+	if (sip->i_ino < tdp->i_ino) {
+		ips[0] = sip;
+		ips[1] = tdp;
+	} else {
+		ips[0] = tdp;
+		ips[1] = sip;
+	}
+
+	xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
+
+	/*
+	 * Increment vnode ref counts since xfs_trans_commit &
+	 * xfs_trans_cancel will both unlock the inodes and
+	 * decrement the associated ref counts.
+	 */
+	VN_HOLD(src_vp);
+	VN_HOLD(target_dir_vp);
+	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+
+	/*
+	 * If the source has too many links, we can't make any more to it.
+	 */
+	if (sip->i_d.di_nlink >= XFS_MAXLINK) {
+		error = XFS_ERROR(EMLINK);
+		goto error_return;
+	}
+
+	/*
+	 * Make sure that nothing with the given name exists in the
+	 * target directory.
+	 */
+	error = xfs_dir_lookup_int(target_dir_bdp, 0, dentry, &e_inum, NULL);
+	if (error != ENOENT) {
+		if (error == 0) {
+			error = XFS_ERROR(EEXIST);
+		}
+		goto error_return;
+	}
+
+	if (resblks == 0 &&
+	    (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
+			target_namelen)))
+		goto error_return;
+
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
+				   sip->i_ino, &first_block, &free_list,
+				   resblks);
+	if (error)
+		goto abort_return;
+	xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	tdp->i_gen++;
+	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
+
+	error = xfs_bumplink(tp, sip);
+	if (error) {
+		goto abort_return;
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * link transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
+	if (error) {
+		xfs_bmap_cancel(&free_list);
+		goto abort_return;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (error) {
+		goto std_return;
+	}
+
+	/* Fall through to std_return with error = 0. */
+std_return:
+	if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
+						DM_EVENT_POSTLINK)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTLINK,
+				target_dir_bdp, DM_RIGHT_NULL,
+				src_bdp, DM_RIGHT_NULL,
+				target_name, NULL, 0, error, 0);
+	}
+	return error;
+
+ abort_return:
+	cancel_flags |= XFS_TRANS_ABORT;
+	/* FALLTHROUGH */
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+
+	goto std_return;
+}
+
+
+
+
+/*
+ * xfs_mkdir
+ *
+ */
+STATIC int
+xfs_mkdir(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	vattr_t		*vap,
+	vnode_t		**vpp,
+	cred_t		*credp)
+{
+	char			*dir_name = (char *)dentry->d_name.name;
+	xfs_inode_t		*dp;
+	xfs_inode_t		*cdp;	/* inode of created dir */
+	vnode_t			*cvp;	/* vnode of created dir */
+	xfs_trans_t		*tp;
+	xfs_ino_t		e_inum;
+	dev_t			rdev;
+	mode_t			mode;
+	xfs_mount_t		*mp;
+	int			cancel_flags;
+	int			error;
+	int			committed;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	vnode_t			*dir_vp;
+	boolean_t		dp_joined_to_trans;
+	boolean_t		created = B_FALSE;
+	int			dm_event_sent = 0;
+	xfs_prid_t		prid;
+	struct xfs_dquot	*udqp, *gdqp;
+	uint			resblks;
+	int			dm_di_mode;
+	int			dir_namelen;
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	dp = XFS_BHVTOI(dir_bdp);
+	mp = dp->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	dir_namelen = dentry->d_name.len;
+	if (dir_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+
+	tp = NULL;
+	dp_joined_to_trans = B_FALSE;
+	dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
+
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
+		error = xfs_dm_send_create_event(dir_bdp, dir_name,
+					dm_di_mode, &dm_event_sent);
+		if (error)
+			return error;
+	}
+
+	/* Return through std_return after this point. */
+
+	vn_trace_entry(dir_vp, "xfs_mkdir", (inst_t *)__return_address);
+
+	mp = dp->i_mount;
+	udqp = gdqp = NULL;
+	if (vap->va_mask & AT_PROJID)
+		prid = (xfs_prid_t)vap->va_projid;
+	else
+		prid = (xfs_prid_t)dfltprid;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		error = xfs_qm_vop_dqalloc(mp, dp,
+					   current->fsuid, current->fsgid,
+					   XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT,
+					   &udqp, &gdqp);
+		if (error)
+			goto std_return;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
+	error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
+				  XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
+					  XFS_TRANS_PERM_LOG_RES,
+					  XFS_MKDIR_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		dp = NULL;
+		goto error_return;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Check for directory link count overflow.
+	 */
+	if (dp->i_d.di_nlink >= XFS_MAXLINK) {
+		error = XFS_ERROR(EMLINK);
+		goto error_return;
+	}
+
+	/*
+	 * Make sure that nothing with the given name exists in the
+	 * target directory.
+	 */
+	error = xfs_dir_lookup_int(dir_bdp, 0, dentry, &e_inum, NULL);
+	if (error != ENOENT) {
+		if (error == 0)
+			error = XFS_ERROR(EEXIST);
+		goto error_return;
+	}
+
+
+	/*
+	 * Reserve disk quota and the inode.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (xfs_trans_reserve_quota(tp, udqp, gdqp, resblks, 1, 0)) {
+			error = XFS_ERROR(EDQUOT);
+			goto error_return;
+		}
+	}
+
+	if (resblks == 0 &&
+	    (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
+		goto error_return;
+	/*
+	 * create the directory inode.
+	 */
+	rdev = (vap->va_mask & AT_RDEV) ? vap->va_rdev : 0;
+	mode = IFDIR | (vap->va_mode & ~IFMT);
+	error = xfs_dir_ialloc(&tp, dp, mode, 2, rdev, credp, prid, resblks > 0,
+		&cdp, NULL);
+	if (error) {
+		if (error == ENOSPC)
+			goto error_return;
+		goto abort_return;
+	}
+	ITRACE(cdp);
+
+	/*
+	 * Now we add the directory inode to the transaction.
+	 * We waited until now since xfs_dir_ialloc might start
+	 * a new transaction.  Had we joined the transaction
+	 * earlier, the locks might have gotten released.
+	 */
+	VN_HOLD(dir_vp);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	dp_joined_to_trans = B_TRUE;
+
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
+			cdp->i_ino, &first_block, &free_list,
+			resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
+	if (error) {
+		ASSERT(error != ENOSPC);
+		goto error1;
+	}
+	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/*
+	 * Bump the in memory version number of the parent directory
+	 * so that other processes accessing it will recognize that
+	 * the directory has changed.
+	 */
+	dp->i_gen++;
+
+	error = XFS_DIR_INIT(mp, tp, cdp, dp);
+	if (error) {
+		goto error2;
+	}
+
+	cdp->i_gen = 1;
+	error = xfs_bumplink(tp, dp);
+	if (error) {
+		goto error2;
+	}
+
+	cvp = XFS_ITOV(cdp);
+
+	created = B_TRUE;
+
+	*vpp = cvp;
+	IHOLD(cdp);
+
+	/*
+	 * Attach the dquots to the new inode and modify the icount incore.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		xfs_qm_vop_dqattach_and_dqmod_newinode(tp, cdp, udqp, gdqp);
+	}
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * mkdir transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		IRELE(cdp);
+		goto error2;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (udqp)
+		 xfs_qm_dqrele(udqp);
+	if (gdqp)
+		 xfs_qm_dqrele(gdqp);
+
+	if (error) {
+		IRELE(cdp);
+	}
+
+	/* Fall through to std_return with error = 0 or errno from
+	 * xfs_trans_commit. */
+
+std_return:
+	if ( (created || (error != 0 && dm_event_sent != 0)) &&
+			DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
+						DM_EVENT_POSTCREATE)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTCREATE,
+					dir_bdp, DM_RIGHT_NULL,
+					created ? XFS_ITOBHV(cdp):NULL,
+					DM_RIGHT_NULL,
+					dir_name, NULL,
+					dm_di_mode, error, 0);
+	}
+	return error;
+
+ error2:
+ error1:
+	xfs_bmap_cancel(&free_list);
+ abort_return:
+	cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	if (!dp_joined_to_trans && (dp != NULL)) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	}
+
+	goto std_return;
+}
+
+
+/*
+ * xfs_rmdir
+ *
+ */
+STATIC int
+xfs_rmdir(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	cred_t		*credp)
+{
+	char			*name = (char *)dentry->d_name.name;
+	xfs_inode_t		*dp;
+	xfs_inode_t		*cdp;	/* child directory */
+	xfs_trans_t		*tp;
+	xfs_mount_t		*mp;
+/*	bhv_desc_t		*bdp;*/
+	int			error;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	int			cancel_flags;
+	int			committed;
+	int			entry_changed;
+	vnode_t			*dir_vp;
+	int			dm_di_mode = 0;
+	int			last_cdp_link;
+	int			namelen;
+	uint			resblks;
+
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	dp = XFS_BHVTOI(dir_bdp);
+
+	vn_trace_entry(dir_vp, "xfs_rmdir", (inst_t *)__return_address);
+
+	if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
+		return XFS_ERROR(EIO);
+	namelen = dentry->d_name.len;
+	if (namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
+		error = dm_send_namesp_event(DM_EVENT_REMOVE,
+					dir_bdp, DM_RIGHT_NULL,
+					NULL, DM_RIGHT_NULL,
+					name, NULL, 0, 0, 0);
+		if (error)
+			return XFS_ERROR(error);
+	}
+
+	/* Return through std_return after this point. */
+
+ retry:
+	cdp = NULL;
+
+	/*
+	 * We need to get a reference to cdp before we get our log
+	 * reservation.	 The reason for this is that we cannot call
+	 * xfs_iget for an inode for which we do not have a reference
+	 * once we've acquired a log reservation.  This is because the
+	 * inode we are trying to get might be in xfs_inactive going
+	 * for a log reservation.  Since we'll have to wait for the
+	 * inactive code to complete before returning from xfs_iget,
+	 * we need to make sure that we don't have log space reserved
+	 * when we call xfs_iget.  Instead we get an unlocked referece
+	 * to the inode before getting our log reservation.
+	 */
+	error = xfs_get_dir_entry(dentry, &cdp);
+	if (error) {
+		REMOVE_DEBUG_TRACE(__LINE__);
+		goto std_return;
+	}
+	mp = dp->i_mount;
+	dm_di_mode = cdp->i_d.di_mode;
+
+	/*
+	 * Get the dquots for the inodes.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		ASSERT(! error);
+		if (XFS_NOT_DQATTACHED(mp, dp))
+			error = xfs_qm_dqattach(dp, 0);
+		if (!error && dp != cdp && XFS_NOT_DQATTACHED(mp, cdp))
+			error = xfs_qm_dqattach(cdp, 0);
+		if (error) {
+			IRELE(cdp);
+			REMOVE_DEBUG_TRACE(__LINE__);
+			goto std_return;
+		}
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * We try to get the real space reservation first,
+	 * allowing for directory btree deletion(s) implying
+	 * possible bmap insert(s).  If we can't get the space
+	 * reservation then we use 0 instead, and avoid the bmap
+	 * btree insert(s) in the directory code by, if the bmap
+	 * insert tries to happen, instead trimming the LAST
+	 * block from the directory.
+	 */
+	resblks = XFS_REMOVE_SPACE_RES(mp);
+	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
+	if (error == ENOSPC) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
+	}
+	if (error) {
+		ASSERT(error != ENOSPC);
+		cancel_flags = 0;
+		IRELE(cdp);
+		goto error_return;
+	}
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	/*
+	 * Now lock the child directory inode and the parent directory
+	 * inode in the proper order.  This will take care of validating
+	 * that the directory entry for the child directory inode has
+	 * not changed while we were obtaining a log reservation.
+	 */
+	error = xfs_lock_dir_and_entry(dp, dentry, cdp, &entry_changed);
+	if (error) {
+		xfs_trans_cancel(tp, cancel_flags);
+		IRELE(cdp);
+		goto std_return;
+	}
+
+	/*
+	 * If the inode we found in the first pass is no longer
+	 * the entry with the given name, then drop our transaction and
+	 * inode reference and start over.
+	 */
+	if (entry_changed) {
+		xfs_trans_cancel(tp, cancel_flags);
+		IRELE(cdp);
+		goto retry;
+	}
+
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	if (dp != cdp) {
+		/*
+		 * Only increment the parent directory vnode count if
+		 * we didn't bump it in looking up cdp.	 The only time
+		 * we don't bump it is when we're looking up ".".
+		 */
+		VN_HOLD(dir_vp);
+	}
+
+	ITRACE(cdp);
+	xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
+
+	if ((error = _MAC_XFS_IACCESS(cdp, MACWRITE, credp))) {
+		goto error_return;
+	}
+
+	ASSERT(cdp->i_d.di_nlink >= 2);
+	if (cdp->i_d.di_nlink != 2) {
+		error = XFS_ERROR(ENOTEMPTY);
+		goto error_return;
+	}
+	if (!XFS_DIR_ISEMPTY(mp, cdp)) {
+		error = XFS_ERROR(ENOTEMPTY);
+		goto error_return;
+	}
+
+	error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
+		&first_block, &free_list, resblks);
+	if (error) {
+		goto error1;
+	}
+
+	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	/*
+	 * Bump the in memory generation count on the parent
+	 * directory so that other can know that it has changed.
+	 */
+	dp->i_gen++;
+
+	/*
+	 * Drop the link from cdp's "..".
+	 */
+	error = xfs_droplink(tp, dp);
+	if (error) {
+		goto error1;
+	}
+
+	/*
+	 * Drop the link from dp to cdp.
+	 */
+	error = xfs_droplink(tp, cdp);
+	if (error) {
+		goto error1;
+	}
+
+	/*
+	 * Drop the "." link from cdp to self.
+	 */
+	error = xfs_droplink(tp, cdp);
+	if (error) {
+		goto error1;
+	}
+
+	/* Determine these before committing transaction */
+	last_cdp_link = (cdp)->i_d.di_nlink==0;
+
+	/*
+	 * Take an extra ref on the child vnode so that it
+	 * does not go to xfs_inactive() from within the commit.
+	 */
+	IHOLD(cdp);
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * rmdir transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
+	if (error) {
+		xfs_bmap_cancel(&free_list);
+		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
+				 XFS_TRANS_ABORT));
+		IRELE(cdp);
+		goto std_return;
+	}
+
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (error) {
+		IRELE(cdp);
+		goto std_return;
+	}
+
+
+	/*
+	 * Let interposed file systems know about removed links.
+	 */
+	VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
+
+	IRELE(cdp);
+
+	/* Fall through to std_return with error = 0 or the errno
+	 * from xfs_trans_commit. */
+std_return:
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
+						DM_EVENT_POSTREMOVE)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTREMOVE,
+					dir_bdp, DM_RIGHT_NULL,
+					NULL, DM_RIGHT_NULL,
+					name, NULL, dm_di_mode,
+					error, 0);
+	}
+	return error;
+
+ error1:
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+	goto std_return;
+}
+
+
+
+/*
+ * xfs_readdir
+ *
+ * Read dp's entries starting at uiop->uio_offset and translate them into
+ * bufsize bytes worth of struct dirents starting at bufbase.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_readdir(
+	bhv_desc_t	*dir_bdp,
+	uio_t		*uiop,
+	cred_t		*credp,
+	int		*eofp)
+{
+	xfs_inode_t		*dp;
+	xfs_trans_t		*tp = NULL;
+	int			error = 0;
+	uint			lock_mode;
+	xfs_off_t		start_offset;
+
+	vn_trace_entry(BHV_TO_VNODE(dir_bdp), "xfs_readdir",
+					       (inst_t *)__return_address);
+	dp = XFS_BHVTOI(dir_bdp);
+
+	if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
+		return XFS_ERROR(EIO);
+	}
+
+	lock_mode = xfs_ilock_map_shared(dp);
+
+	if ((dp->i_d.di_mode & IFMT) != IFDIR) {
+		xfs_iunlock_map_shared(dp, lock_mode);
+		return XFS_ERROR(ENOTDIR);
+	}
+
+	start_offset = uiop->uio_offset;
+	error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
+	if (start_offset != uiop->uio_offset) {
+		xfs_ichgtime(dp, XFS_ICHGTIME_ACC);
+	}
+
+	xfs_iunlock_map_shared(dp, lock_mode);
+
+	return error;
+}
+
+/*
+ * xfs_symlink
+ *
+ */
+STATIC int
+xfs_symlink(
+	bhv_desc_t	*dir_bdp,
+	struct dentry	*dentry,
+	vattr_t		*vap,
+	char		*target_path,
+	vnode_t		**vpp,
+	cred_t		*credp)
+{
+	xfs_trans_t		*tp;
+	xfs_mount_t		*mp;
+	xfs_inode_t		*dp;
+	xfs_inode_t		*ip;
+	int			error;
+	int			pathlen;
+	xfs_ino_t		e_inum;
+	dev_t			rdev;
+	xfs_bmap_free_t		free_list;
+	xfs_fsblock_t		first_block;
+	boolean_t		dp_joined_to_trans;
+	vnode_t			*dir_vp;
+	uint			cancel_flags;
+	int			committed;
+	xfs_fileoff_t		first_fsb;
+	xfs_filblks_t		fs_blocks;
+	int			nmaps;
+	xfs_bmbt_irec_t		mval[SYMLINK_MAPS];
+	xfs_daddr_t		d;
+	char			*cur_chunk;
+	int			byte_cnt;
+	int			n;
+	xfs_buf_t		*bp;
+	xfs_prid_t		prid;
+	struct xfs_dquot	*udqp, *gdqp;
+	uint			resblks;
+	char			*link_name = (char *)dentry->d_name.name;
+	int			link_namelen;
+
+	*vpp = NULL;
+	dir_vp = BHV_TO_VNODE(dir_bdp);
+	dp = XFS_BHVTOI(dir_bdp);
+	dp_joined_to_trans = B_FALSE;
+	error = 0;
+	ip = NULL;
+	tp = NULL;
+
+	vn_trace_entry(dir_vp, "xfs_symlink", (inst_t *)__return_address);
+
+	mp = dp->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	link_namelen = dentry->d_name.len;
+	if (link_namelen >= MAXNAMELEN)
+		return XFS_ERROR(ENAMETOOLONG);
+	/*
+	 * Check component lengths of the target path name.
+	 */
+	pathlen = strlen(target_path);
+	if (pathlen >= MAXPATHLEN)	/* total string too long */
+		return XFS_ERROR(ENAMETOOLONG);
+	if (pathlen >= MAXNAMELEN) {	/* is any component too long? */
+		int len, total;
+		char *path;
+
+		for(total = 0, path = target_path; total < pathlen;) {
+			/*
+			 * Skip any slashes.
+			 */
+			while(*path == '/') {
+				total++;
+				path++;
+			}
+
+			/*
+			 * Count up to the next slash or end of path.
+			 * Error out if the component is bigger than MAXNAMELEN.
+			 */
+			for(len = 0; *path != '/' && total < pathlen;total++, path++) {
+				if (++len >= MAXNAMELEN) {
+					error = ENAMETOOLONG;
+					return error;
+				}
+			}
+		}
+	}
+
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
+		error = dm_send_namesp_event(DM_EVENT_SYMLINK, dir_bdp, DM_RIGHT_NULL,
+						NULL, DM_RIGHT_NULL,
+						link_name, target_path,
+						0, 0, 0);
+		if (error)
+			return error;
+	}
+
+	/* Return through std_return after this point. */
+
+	udqp = gdqp = NULL;
+	if (vap->va_mask & AT_PROJID)
+		prid = (xfs_prid_t)vap->va_projid;
+	else
+		prid = (xfs_prid_t)dfltprid;
+
+	/*
+	 * Make sure that we have allocated dquot(s) on disk.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		error = xfs_qm_vop_dqalloc(mp, dp,
+					   current->fsuid, current->fsgid,
+					   XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT,
+					   &udqp, &gdqp);
+		if (error)
+			goto std_return;
+	}
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
+	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+	/*
+	 * The symlink will fit into the inode data fork?
+	 * There can't be any attributes so we get the whole variable part.
+	 */
+	if (pathlen <= XFS_LITINO(mp))
+		fs_blocks = 0;
+	else
+		fs_blocks = XFS_B_TO_FSB(mp, pathlen);
+	resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
+	error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	if (error == ENOSPC && fs_blocks == 0) {
+		resblks = 0;
+		error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
+				XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
+	}
+	if (error) {
+		cancel_flags = 0;
+		dp = NULL;
+		goto error_return;
+	}
+
+	xfs_ilock(dp, XFS_ILOCK_EXCL);
+
+	/*
+	 * Since we've already started a transaction, we cannot allow
+	 * the lookup to do a vn_get().
+	 */
+	error = xfs_dir_lookup_int(dir_bdp, 0, dentry, &e_inum, NULL);
+	if (error != ENOENT) {
+		if (!error) {
+			error = XFS_ERROR(EEXIST);
+		}
+		goto error_return;
+	}
+	/*
+	 * Reserve disk quota : blocks and inode.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (xfs_trans_reserve_quota(tp, udqp, gdqp, resblks, 1, 0)) {
+			error = XFS_ERROR(EDQUOT);
+			goto error_return;
+		}
+	}
+
+	/*
+	 * Check for ability to enter directory entry, if no space reserved.
+	 */
+	if (resblks == 0 &&
+	    (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
+		goto error_return;
+	/*
+	 * Initialize the bmap freelist prior to calling either
+	 * bmapi or the directory create code.
+	 */
+	XFS_BMAP_INIT(&free_list, &first_block);
+
+	/*
+	 * Allocate an inode for the symlink.
+	 */
+	rdev = (vap->va_mask & AT_RDEV) ? vap->va_rdev : 0;
+
+	error = xfs_dir_ialloc(&tp, dp, IFLNK | (vap->va_mode&~IFMT),
+			       1, rdev, credp, prid, resblks > 0, &ip, NULL);
+	if (error) {
+		if (error == ENOSPC)
+			goto error_return;
+		goto error1;
+	}
+	ITRACE(ip);
+
+	VN_HOLD(dir_vp);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+	dp_joined_to_trans = B_TRUE;
+
+	/*
+	 * Also attach the dquot(s) to it, if applicable.
+	 */
+	if (XFS_IS_QUOTA_ON(mp)) {
+		xfs_qm_vop_dqattach_and_dqmod_newinode(tp, ip, udqp, gdqp);
+	}
+
+	if (resblks)
+		resblks -= XFS_IALLOC_SPACE_RES(mp);
+	/*
+	 * If the symlink will fit into the inode, write it inline.
+	 */
+	if (pathlen <= XFS_IFORK_DSIZE(ip)) {
+		xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
+		bcopy(target_path, ip->i_df.if_u1.if_data, pathlen);
+		ip->i_d.di_size = pathlen;
+
+		/*
+		 * The inode was initially created in extent format.
+		 */
+		ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
+		ip->i_df.if_flags |= XFS_IFINLINE;
+
+		ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+
+	} else {
+		first_fsb = 0;
+		nmaps = SYMLINK_MAPS;
+
+		error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
+				  XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
+				  &first_block, resblks, mval, &nmaps,
+				  &free_list);
+		if (error) {
+			goto error1;
+		}
+
+		if (resblks)
+			resblks -= fs_blocks;
+		ip->i_d.di_size = pathlen;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+		cur_chunk = target_path;
+		for (n = 0; n < nmaps; n++) {
+			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+					       BTOBB(byte_cnt), 0);
+			ASSERT(bp && !XFS_BUF_GETERROR(bp));
+			if (pathlen < byte_cnt) {
+				byte_cnt = pathlen;
+			}
+			pathlen -= byte_cnt;
+
+			bcopy(cur_chunk, XFS_BUF_PTR(bp), byte_cnt);
+			cur_chunk += byte_cnt;
+
+			xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
+		}
+	}
+
+	/*
+	 * Create the directory entry for the symlink.
+	 */
+	error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
+			ip->i_ino, &first_block, &free_list, resblks);
+	if (error) {
+		goto error1;
+	}
+	xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+
+	/*
+	 * Bump the in memory version number of the parent directory
+	 * so that other processes accessing it will recognize that
+	 * the directory has changed.
+	 */
+	dp->i_gen++;
+
+	/*
+	 * If this is a synchronous mount, make sure that the
+	 * symlink transaction goes to disk before returning to
+	 * the user.
+	 */
+	if (mp->m_flags & XFS_MOUNT_WSYNC) {
+		xfs_trans_set_sync(tp);
+	}
+
+	/*
+	 * xfs_trans_commit normally decrements the vnode ref count
+	 * when it unlocks the inode. Since we want to return the
+	 * vnode to the caller, we bump the vnode ref count now.
+	 */
+	IHOLD(ip);
+
+	error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
+	if (error) {
+		goto error2;
+	}
+	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	/* Fall through to std_return with error = 0 or errno from
+	 * xfs_trans_commit	*/
+std_return:
+	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
+			     DM_EVENT_POSTSYMLINK)) {
+		(void) dm_send_namesp_event(DM_EVENT_POSTSYMLINK,
+						dir_bdp, DM_RIGHT_NULL,
+						error? NULL:XFS_ITOBHV(ip),
+						DM_RIGHT_NULL,
+						link_name, target_path,
+						0, error, 0);
+	}
+
+	if (!error) {
+		vnode_t *vp;
+
+		ASSERT(ip);
+		vp = XFS_ITOV(ip);
+		*vpp = vp;
+	}
+	return error;
+
+ error2:
+	IRELE(ip);
+ error1:
+	xfs_bmap_cancel(&free_list);
+	cancel_flags |= XFS_TRANS_ABORT;
+ error_return:
+	xfs_trans_cancel(tp, cancel_flags);
+	if (udqp)
+		xfs_qm_dqrele(udqp);
+	if (gdqp)
+		xfs_qm_dqrele(gdqp);
+
+	if (!dp_joined_to_trans && (dp != NULL)) {
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
+	}
+
+	goto std_return;
+}
+
+
+/*
+ * xfs_fid2
+ *
+ * A fid routine that takes a pointer to a previously allocated
+ * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
+ */
+STATIC int
+xfs_fid2(
+	bhv_desc_t	*bdp,
+	fid_t		*fidp)
+{
+	xfs_inode_t	*ip;
+	xfs_fid2_t	*xfid;
+
+	vn_trace_entry(BHV_TO_VNODE(bdp), "xfs_fid2",
+		       (inst_t *)__return_address);
+	ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
+
+	xfid = (xfs_fid2_t *)fidp;
+	ip = XFS_BHVTOI(bdp);
+	xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
+	xfid->fid_pad = 0;
+	/*
+	 * use bcopy because the inode is a long long and there's no
+	 * assurance that xfid->fid_ino is properly aligned.
+	 */
+	bcopy(&ip->i_ino, &xfid->fid_ino, sizeof xfid->fid_ino);
+	xfid->fid_gen = ip->i_d.di_gen;
+
+	return 0;
+}
+
+
+/*
+ * xfs_rwlock
+ */
+int
+xfs_rwlock(
+	bhv_desc_t	*bdp,
+	vrwlock_t	locktype)
+{
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+
+	vp = BHV_TO_VNODE(bdp);
+	if (vp->v_type == VDIR)
+		return 1;
+	ip = XFS_BHVTOI(bdp);
+	if (locktype == VRWLOCK_WRITE) {
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	} else if (locktype == VRWLOCK_TRY_READ) {
+		return (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED));
+	} else if (locktype == VRWLOCK_TRY_WRITE) {
+		return (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL));
+	} else {
+		ASSERT((locktype == VRWLOCK_READ) ||
+		       (locktype == VRWLOCK_WRITE_DIRECT));
+		xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	}
+
+	return 1;
+}
+
+
+/*
+ * xfs_rwunlock
+ */
+void
+xfs_rwunlock(
+	bhv_desc_t	*bdp,
+	vrwlock_t	locktype)
+{
+	xfs_inode_t	*ip;
+	xfs_inode_t	*release_ip;
+	vnode_t		*vp;
+	int		error;
+
+	vp = BHV_TO_VNODE(bdp);
+	if (vp->v_type == VDIR)
+		return;
+	ip = XFS_BHVTOI(bdp);
+	if (locktype == VRWLOCK_WRITE) {
+		/*
+		 * In the write case, we may have added a new entry to
+		 * the reference cache.	 This might store a pointer to
+		 * an inode to be released in this inode.  If it is there,
+		 * clear the pointer and release the inode after unlocking
+		 * this one.
+		 */
+		release_ip = ip->i_release;
+		ip->i_release = NULL;
+		xfs_iunlock (ip, XFS_IOLOCK_EXCL);
+
+		if (release_ip != NULL) {
+			VOP_RELEASE(XFS_ITOV(release_ip), error);
+			VN_RELE(XFS_ITOV(release_ip));
+		}
+	} else {
+		ASSERT((locktype == VRWLOCK_READ) ||
+		       (locktype == VRWLOCK_WRITE_DIRECT));
+		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	}
+	return;
+}
+
+STATIC int
+xfs_inode_flush(bhv_desc_t	*bdp,
+		int		flags)
+{
+	xfs_inode_t	*ip;
+	xfs_dinode_t	*dip;
+	xfs_mount_t	*mp;
+	xfs_buf_t	*bp;
+	int		error = 0;
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/* Bypass inodes which have already been cleaned by
+	 * the inode flush clustering code inside xfs_iflush
+	 */
+	if ((ip->i_update_core == 0) &&
+	    ((ip->i_itemp == NULL) ||
+	     !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)))
+		return 0;
+
+	if (flags & FLUSH_LOG) {
+		xfs_inode_log_item_t *iip = ip->i_itemp;
+ 
+		if (iip && iip->ili_last_lsn) {
+			xlog_t	*log = mp->m_log;
+			xfs_lsn_t	sync_lsn;
+			int		s, log_flags = XFS_LOG_FORCE;
+
+			s = GRANT_LOCK(log);
+			sync_lsn = log->l_last_sync_lsn;
+			GRANT_UNLOCK(log, s);
+
+			if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
+				return 0;
+
+			if (flags & FLUSH_SYNC)
+				log_flags |= XFS_LOG_SYNC;
+			return xfs_log_force(mp, iip->ili_last_lsn,
+						log_flags);
+		}
+	}
+
+	/* We make this non-blocking if the inode is contended,
+	 * return EAGAIN to indicate to the caller that they
+	 * did not succeed. This prevents the flush path from
+	 * blocking on inodes inside another operation right
+	 * now, they get caught later by xfs_sync.
+	 */
+	if (flags & FLUSH_INODE) {
+		if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+			if ((xfs_ipincount(ip) == 0) && xfs_iflock_nowait(ip)) {
+				int	flush_flags;
+
+#if 0
+				/* not turning this on until some
+				 * performance analysis is done
+				 */
+				if (flags & FLUSH_SYNC)
+					flush_flags = XFS_IFLUSH_SYNC;
+				else
+#endif
+					flush_flags = XFS_IFLUSH_DELWRI;
+
+				xfs_ifunlock(ip);
+				xfs_iunlock(ip, XFS_ILOCK_SHARED);
+				error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
+				if (error)
+					goto eagain;
+				xfs_buf_relse(bp);
+
+				if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED) == 0)
+					goto eagain;
+
+				if ((xfs_ipincount(ip) == 0) &&
+				     xfs_iflock_nowait(ip))
+					error = xfs_iflush(ip, flush_flags);
+			} else {
+				error = EAGAIN;
+			}
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		} else {
+eagain:
+			error = EAGAIN;
+		}
+	}
+
+	return error;
+}
+
+
+int
+xfs_set_dmattrs (
+	bhv_desc_t	*bdp,
+	u_int		evmask,
+	u_int16_t	state,
+	cred_t		*credp)
+{
+	xfs_inode_t	*ip;
+	xfs_trans_t	*tp;
+	xfs_mount_t	*mp;
+	int		error;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return XFS_ERROR(EPERM);
+
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
+	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
+	ip->i_iocore.io_dmstate	 = ip->i_d.di_dmstate  = state;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	IHOLD(ip);
+	error = xfs_trans_commit(tp, 0, NULL);
+
+	return error;
+}
+
+
+/*
+ * xfs_reclaim
+ */
+STATIC int
+xfs_reclaim(
+	bhv_desc_t	*bdp)
+{
+	xfs_inode_t	*ip;
+	vnode_t		*vp;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_reclaim", (inst_t *)__return_address);
+
+	ASSERT(!VN_MAPPED(vp));
+	ip = XFS_BHVTOI(bdp);
+
+	if ((ip->i_d.di_mode & IFMT) == IFREG) {
+		if (ip->i_d.di_size > 0) {
+			/*
+			 * Flush and invalidate any data left around that is
+			 * a part of this file.
+			 *
+			 * Get the inode's i/o lock so that buffers are pushed
+			 * out while holding the proper lock.  We can't hold
+			 * the inode lock here since flushing out buffers may
+			 * cause us to try to get the lock in xfs_strategy().
+			 *
+			 * We don't have to call remapf() here, because there
+			 * cannot be any mapped file references to this vnode
+			 * since it is being reclaimed.
+			 */
+			xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+			/*
+			 * If we hit an IO error, we need to make sure that the
+			 * buffer and page caches of file data for
+			 * the file are tossed away. We don't want to use
+			 * VOP_FLUSHINVAL_PAGES here because we don't want dirty
+			 * pages to stay attached to the vnode, but be
+			 * marked P_BAD. pdflush/vnode_pagebad
+			 * hates that.
+			 */
+			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+				VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_NONE);
+			} else {
+				VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
+			}
+
+			ASSERT(VN_CACHED(vp) == 0);
+			ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
+			       ip->i_delayed_blks == 0);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		} else if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+			/*
+			 * di_size field may not be quite accurate if we're
+			 * shutting down.
+			 */
+			VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
+			ASSERT(VN_CACHED(vp) == 0);
+		}
+	}
+
+	/* If we have nothing to flush with this inode then complete the
+	 * teardown now, otherwise break the link between the xfs inode
+	 * and the linux inode and clean up the xfs inode later. This
+	 * avoids flushing the inode to disk during the delete operation
+	 * itself.
+	 */
+	if (!ip->i_update_core && (ip->i_itemp == NULL)) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
+	} else {
+		xfs_mount_t	*mp = ip->i_mount;
+
+		/* Protect sync from us */
+		XFS_MOUNT_ILOCK(mp);
+		vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
+		XFS_MOUNT_IUNLOCK(mp);
+	}
+	return 0;
+}
+
+int
+xfs_finish_reclaim(
+	xfs_inode_t	*ip,
+	int		locked,
+	int		sync_mode)
+{
+	xfs_ihash_t	*ih = ip->i_hash;
+	int		error;
+
+	if (!locked)
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	/* The hash lock here protects a thread in xfs_iget_core from
+	 * racing with us on linking the inode back with a vnode.
+	 * Once we have the XFS_IRECLAIM flag set it will not touch
+	 * us.
+	 */
+	write_lock(&ih->ih_lock);
+	if (ip->i_flags & XFS_IRECLAIM || (!locked && XFS_ITOV_NULL(ip))) {
+		write_unlock(&ih->ih_lock);
+		if (!locked)
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		return(1);
+	}
+	ip->i_flags |= XFS_IRECLAIM;
+	write_unlock(&ih->ih_lock);
+
+	/*
+	 * If the inode is still dirty, then flush it out.  If the inode
+	 * is not in the AIL, then it will be OK to flush it delwri as
+	 * long as xfs_iflush() does not keep any references to the inode.
+	 * We leave that decision up to xfs_iflush() since it has the
+	 * knowledge of whether it's OK to simply do a delwri flush of
+	 * the inode or whether we need to wait until the inode is
+	 * pulled from the AIL.
+	 * We get the flush lock regardless, though, just to make sure
+	 * we don't free it while it is being flushed.
+	 */
+	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		if (!locked) {
+			xfs_iflock(ip);
+		}
+
+		if (ip->i_update_core ||
+		    ((ip->i_itemp != NULL) &&
+		     (ip->i_itemp->ili_format.ilf_fields != 0))) {
+			error = xfs_iflush(ip, sync_mode);
+			/*
+			 * If we hit an error, typically because of filesystem
+			 * shutdown, we don't need to let vn_reclaim to know
+			 * because we're gonna reclaim the inode anyway.
+			 */
+			if (error) {
+				xfs_iunlock(ip, XFS_ILOCK_EXCL);
+				xfs_ireclaim(ip);
+				return (0);
+			}
+			xfs_iflock(ip); /* synchronize with xfs_iflush_done */
+		}
+
+		ASSERT(ip->i_update_core == 0);
+		ASSERT(ip->i_itemp == NULL ||
+		       ip->i_itemp->ili_format.ilf_fields == 0);
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	xfs_ireclaim(ip);
+	return 0;
+}
+
+/*
+ * xfs_alloc_file_space()
+ *	This routine allocates disk space for the given file.
+ *
+ *	If alloc_type == 0, this request is for an ALLOCSP type
+ *	request which will change the file size.  In this case, no
+ *	DMAPI event will be generated by the call.  A TRUNCATE event
+ *	will be generated later by xfs_setattr.
+ *
+ *	If alloc_type != 0, this request is for a RESVSP type
+ *	request, and a DMAPI DM_EVENT_WRITE will be generated if the
+ *	lower block boundary byte address is less than the file's
+ *	length.
+ *
+ * RETURNS:
+ *	 0 on success
+ *	errno on error
+ *
+ */
+int
+xfs_alloc_file_space(
+	xfs_inode_t		*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len,
+	int			alloc_type,
+	int			attr_flags)
+{
+	xfs_filblks_t		allocated_fsb;
+	xfs_filblks_t		allocatesize_fsb;
+	int			committed;
+	xfs_off_t		count;
+	xfs_filblks_t		datablocks;
+	int			error;
+	xfs_fsblock_t		firstfsb;
+	xfs_bmap_free_t		free_list;
+	xfs_bmbt_irec_t		*imapp;
+	xfs_bmbt_irec_t		imaps[1];
+	xfs_mount_t		*mp;
+	int			numrtextents;
+	int			reccount;
+	uint			resblks;
+	int			rt;
+	int			rtextsize;
+	xfs_fileoff_t		startoffset_fsb;
+	xfs_trans_t		*tp;
+	int			xfs_bmapi_flags;
+
+	vn_trace_entry(XFS_ITOV(ip), "xfs_alloc_file_space",
+					       (inst_t *)__return_address);
+	mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return XFS_ERROR(EIO);
+
+	/*
+	 * determine if this is a realtime file
+	 */
+	if ((rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) != 0) {
+		if (ip->i_d.di_extsize)
+			rtextsize = ip->i_d.di_extsize;
+		else
+			rtextsize = mp->m_sb.sb_rextsize;
+	} else
+		rtextsize = 0;
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((error = xfs_qm_dqattach(ip, 0)))
+				return error;
+		}
+	}
+
+	if (len <= 0)
+		return XFS_ERROR(EINVAL);
+
+	count = len;
+	error = 0;
+	imapp = &imaps[0];
+	reccount = 1;
+	xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
+	startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
+	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
+
+	/*	Generate a DMAPI event if needed.	*/
+	if (alloc_type != 0 && offset < ip->i_d.di_size &&
+			(attr_flags&ATTR_DMI) == 0  &&
+			DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
+		xfs_off_t	    end_dmi_offset;
+
+		end_dmi_offset = offset+len;
+		if (end_dmi_offset > ip->i_d.di_size)
+			end_dmi_offset = ip->i_d.di_size;
+		error = xfs_dm_send_data_event(DM_EVENT_WRITE, XFS_ITOBHV(ip),
+			offset, end_dmi_offset - offset,
+			0, NULL);
+		if (error)
+			return(error);
+	}
+
+	/*
+	 * allocate file space until done or until there is an error
+	 */
+retry:
+	while (allocatesize_fsb && !error) {
+		/*
+		 * determine if reserving space on
+		 * the data or realtime partition.
+		 */
+		if (rt) {
+			xfs_fileoff_t s, e;
+
+			s = startoffset_fsb;
+			do_div(s, rtextsize);
+			s *= rtextsize;
+			e = roundup_64(startoffset_fsb + allocatesize_fsb,
+				rtextsize);
+			numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize;
+			datablocks = 0;
+		} else {
+			datablocks = allocatesize_fsb;
+			numrtextents = 0;
+		}
+
+		/*
+		 * allocate and setup the transaction
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
+		error = xfs_trans_reserve(tp,
+					  resblks,
+					  XFS_WRITE_LOG_RES(mp),
+					  numrtextents,
+					  XFS_TRANS_PERM_LOG_RES,
+					  XFS_WRITE_LOG_COUNT);
+
+		/*
+		 * check for running out of space
+		 */
+		if (error) {
+			/*
+			 * Free the transaction structure.
+			 */
+			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (XFS_IS_QUOTA_ON(mp)) {
+			if (xfs_trans_reserve_quota(tp,
+						    ip->i_udquot,
+						    ip->i_gdquot,
+						    resblks, 0, 0)) {
+				error = XFS_ERROR(EDQUOT);
+				goto error1;
+			}
+		}
+
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+
+		/*
+		 * issue the bmapi() call to allocate the blocks
+		 */
+		XFS_BMAP_INIT(&free_list, &firstfsb);
+		error = xfs_bmapi(tp, ip, startoffset_fsb,
+				  allocatesize_fsb, xfs_bmapi_flags,
+				  &firstfsb, 0, imapp, &reccount,
+				  &free_list);
+		if (error) {
+			goto error0;
+		}
+
+		/*
+		 * complete the transaction
+		 */
+		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+		if (error) {
+			goto error0;
+		}
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		if (error) {
+			break;
+		}
+
+		allocated_fsb = imapp->br_blockcount;
+
+		if (reccount == 0) {
+			error = XFS_ERROR(ENOSPC);
+			break;
+		}
+
+		startoffset_fsb += allocated_fsb;
+		allocatesize_fsb -= allocated_fsb;
+	}
+dmapi_enospc_check:
+	if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
+	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
+
+		error = dm_send_namesp_event(DM_EVENT_NOSPACE,
+				XFS_ITOBHV(ip), DM_RIGHT_NULL,
+				XFS_ITOBHV(ip), DM_RIGHT_NULL,
+				NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
+		if (error == 0)
+			goto retry;	/* Maybe DMAPI app. has made space */
+		/* else fall through with error = xfs_dm_send_data_event result. */
+	}
+
+	return error;
+
+ error0:
+	xfs_bmap_cancel(&free_list);
+ error1:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	goto dmapi_enospc_check;
+}
+
+/*
+ * Zero file bytes between startoff and endoff inclusive.
+ * The iolock is held exclusive and no blocks are buffered.
+ */
+STATIC int
+xfs_zero_remaining_bytes(
+	xfs_inode_t		*ip,
+	xfs_off_t		startoff,
+	xfs_off_t		endoff)
+{
+	xfs_buf_t		*bp;
+	int			error=0;
+	xfs_bmbt_irec_t		imap;
+	xfs_off_t		lastoffset;
+	xfs_mount_t		*mp;
+	int			nimap;
+	xfs_off_t		offset;
+	xfs_fileoff_t		offset_fsb;
+
+	mp = ip->i_mount;
+	bp = XFS_ngetrbuf(mp->m_sb.sb_blocksize,mp);
+	ASSERT(!XFS_BUF_GETERROR(bp));
+
+	if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+		XFS_BUF_SET_TARGET(bp, mp->m_rtdev_targp);
+	} else {
+		XFS_BUF_SET_TARGET(bp, mp->m_ddev_targp);
+	}
+
+	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+		nimap = 1;
+		error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
+			&nimap, NULL);
+		if (error || nimap < 1)
+			break;
+		ASSERT(imap.br_blockcount >= 1);
+		ASSERT(imap.br_startoff == offset_fsb);
+		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
+		if (lastoffset > endoff)
+			lastoffset = endoff;
+		if (imap.br_startblock == HOLESTARTBLOCK)
+			continue;
+		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+		if (imap.br_state == XFS_EXT_UNWRITTEN)
+			continue;
+		XFS_BUF_UNDONE(bp);
+		XFS_BUF_UNWRITE(bp);
+		XFS_BUF_READ(bp);
+		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
+		xfsbdstrat(mp, bp);
+		if ((error = xfs_iowait(bp))) {
+			xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
+					  mp, bp, XFS_BUF_ADDR(bp));
+			break;
+		}
+		bzero(XFS_BUF_PTR(bp) +
+			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
+		      lastoffset - offset + 1);
+		XFS_BUF_UNDONE(bp);
+		XFS_BUF_UNREAD(bp);
+		XFS_BUF_WRITE(bp);
+		xfsbdstrat(mp, bp);
+		if ((error = xfs_iowait(bp))) {
+			xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
+					  mp, bp, XFS_BUF_ADDR(bp));
+			break;
+		}
+	}
+	XFS_nfreerbuf(bp);
+	return error;
+}
+
+/*
+ * xfs_free_file_space()
+ *	This routine frees disk space for the given file.
+ *
+ *	This routine is only called by xfs_change_file_space
+ *	for an UNRESVSP type call.
+ *
+ * RETURNS:
+ *	 0 on success
+ *	errno on error
+ *
+ */
+STATIC int
+xfs_free_file_space(
+	xfs_inode_t		*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len,
+	int			attr_flags)
+{
+	int			committed;
+	int			done;
+	xfs_off_t		end_dmi_offset;
+	xfs_fileoff_t		endoffset_fsb;
+	int			error;
+	xfs_fsblock_t		firstfsb;
+	xfs_bmap_free_t		free_list;
+	xfs_off_t		ilen;
+	xfs_bmbt_irec_t		imap;
+	xfs_off_t		ioffset;
+	xfs_extlen_t		mod=0;
+	xfs_mount_t		*mp;
+	int			nimap;
+	uint			resblks;
+	int			rounding;
+	int			specrt;
+	xfs_fileoff_t		startoffset_fsb;
+	xfs_trans_t		*tp;
+
+	vn_trace_entry(XFS_ITOV(ip), "xfs_free_file_space",
+					       (inst_t *)__return_address);
+	mp = ip->i_mount;
+
+	if (XFS_IS_QUOTA_ON(mp)) {
+		if (XFS_NOT_DQATTACHED(mp, ip)) {
+			if ((error = xfs_qm_dqattach(ip, 0)))
+				return error;
+		}
+	}
+
+	error = 0;
+	if (len <= 0)	/* if nothing being freed */
+		return error;
+	specrt =
+		(ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+		!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb);
+	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
+	end_dmi_offset = offset + len;
+	endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
+
+	if (offset < ip->i_d.di_size &&
+	    (attr_flags&ATTR_DMI) == 0	&&
+	    DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
+		if (end_dmi_offset > ip->i_d.di_size)
+			end_dmi_offset = ip->i_d.di_size;
+		error = xfs_dm_send_data_event(DM_EVENT_WRITE, XFS_ITOBHV(ip),
+			offset, end_dmi_offset - offset,
+			AT_DELAY_FLAG(attr_flags), NULL);
+		if (error)
+			return(error);
+	}
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog),
+			(__uint8_t)NBPP);
+	ilen = len + (offset & (rounding - 1));
+	ioffset = offset & ~(rounding - 1);
+	if (ilen & (rounding - 1))
+		ilen = (ilen + rounding) & ~(rounding - 1);
+	xfs_inval_cached_pages(XFS_ITOV(ip), &(ip->i_iocore), ioffset, 0, 0);
+	/*
+	 * Need to zero the stuff we're not freeing, on disk.
+	 * If its specrt (realtime & can't use unwritten extents) then
+	 * we actually need to zero the extent edges.  Otherwise xfs_bunmapi
+	 * will take care of it for us.
+	 */
+	if (specrt) {
+		nimap = 1;
+		error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
+			&imap, &nimap, NULL);
+		if (error)
+			return error;
+		ASSERT(nimap == 0 || nimap == 1);
+		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+			xfs_daddr_t	block;
+
+			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+			block = imap.br_startblock;
+			mod = do_div(block, mp->m_sb.sb_rextsize);
+			if (mod)
+				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
+		}
+		nimap = 1;
+		error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
+			&imap, &nimap, NULL);
+		if (error)
+			return error;
+		ASSERT(nimap == 0 || nimap == 1);
+		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
+			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+			mod++;
+			if (mod && (mod != mp->m_sb.sb_rextsize))
+				endoffset_fsb -= mod;
+		}
+	}
+	if ((done = (endoffset_fsb <= startoffset_fsb)))
+		/*
+		 * One contiguous piece to clear
+		 */
+		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
+	else {
+		/*
+		 * Some full blocks, possibly two pieces to clear
+		 */
+		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
+			error = xfs_zero_remaining_bytes(ip, offset,
+				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
+		if (!error &&
+		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
+			error = xfs_zero_remaining_bytes(ip,
+				XFS_FSB_TO_B(mp, endoffset_fsb),
+				offset + len - 1);
+	}
+
+	/*
+	 * free file space until done or until there is an error
+	 */
+	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+	while (!error && !done) {
+
+		/*
+		 * allocate and setup the transaction
+		 */
+		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+		error = xfs_trans_reserve(tp,
+					  resblks,
+					  XFS_WRITE_LOG_RES(mp),
+					  0,
+					  XFS_TRANS_PERM_LOG_RES,
+					  XFS_WRITE_LOG_COUNT);
+
+		/*
+		 * check for running out of space
+		 */
+		if (error) {
+			/*
+			 * Free the transaction structure.
+			 */
+			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
+			xfs_trans_cancel(tp, 0);
+			break;
+		}
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (XFS_IS_QUOTA_ON(mp)) {
+			if (xfs_trans_reserve_quota(tp,
+						    ip->i_udquot,
+						    ip->i_gdquot,
+						    resblks, 0, 0)) {
+				error = XFS_ERROR(EDQUOT);
+				goto error1;
+			}
+		}
+
+		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+		xfs_trans_ihold(tp, ip);
+
+		/*
+		 * issue the bunmapi() call to free the blocks
+		 */
+		XFS_BMAP_INIT(&free_list, &firstfsb);
+		error = xfs_bunmapi(tp, ip, startoffset_fsb,
+				  endoffset_fsb - startoffset_fsb,
+				  0, 2, &firstfsb, &free_list, &done);
+		if (error) {
+			goto error0;
+		}
+
+		/*
+		 * complete the transaction
+		 */
+		error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
+		if (error) {
+			goto error0;
+		}
+
+		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	}
+
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+
+ error0:
+	xfs_bmap_cancel(&free_list);
+ error1:
+	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+}
+
+/*
+ * xfs_change_file_space()
+ *	This routine allocates or frees disk space for the given file.
+ *	The user specified parameters are checked for alignment and size
+ *	limitations.
+ *
+ * RETURNS:
+ *	 0 on success
+ *	errno on error
+ *
+ */
+int
+xfs_change_file_space(
+	bhv_desc_t	*bdp,
+	int		cmd,
+	xfs_flock64_t	*bf,
+	xfs_off_t	offset,
+	cred_t		*credp,
+	int		attr_flags)
+{
+	int		clrprealloc;
+	int		error;
+	xfs_fsize_t	fsize;
+	xfs_inode_t	*ip;
+	xfs_mount_t	*mp;
+	int		setprealloc;
+	xfs_off_t	startoffset;
+	xfs_off_t	llen;
+	xfs_trans_t	*tp;
+	vattr_t		va;
+	vnode_t		*vp;
+
+	vp = BHV_TO_VNODE(bdp);
+
+	vn_trace_entry(vp, "xfs_change_file_space",
+						(inst_t *)__return_address);
+	ip = XFS_BHVTOI(bdp);
+	mp = ip->i_mount;
+
+	/*
+	 * must be a regular file and have write permission
+	 */
+	if (vp->v_type != VREG)
+		return XFS_ERROR(EINVAL);
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+	if ((error = xfs_iaccess(ip, IWRITE, credp))) {
+		xfs_iunlock(ip, XFS_ILOCK_SHARED);
+		return error;
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	switch (bf->l_whence) {
+	case 0: /*SEEK_SET*/
+		break;
+	case 1: /*SEEK_CUR*/
+		bf->l_start += offset;
+		break;
+	case 2: /*SEEK_END*/
+		bf->l_start += ip->i_d.di_size;
+		break;
+	default:
+		return XFS_ERROR(EINVAL);
+	}
+
+	llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
+
+	if (   (bf->l_start < 0)
+	    || (bf->l_start > XFS_MAX_FILE_OFFSET)
+	    || (bf->l_start + llen < 0)
+	    || (bf->l_start + llen > XFS_MAX_FILE_OFFSET))
+		return XFS_ERROR(EINVAL);
+
+	bf->l_whence = 0;
+
+	startoffset = bf->l_start;
+	fsize = ip->i_d.di_size;
+
+	/*
+	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
+	 * file space.
+	 * These calls do NOT zero the data space allocated to the file,
+	 * nor do they change the file size.
+	 *
+	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
+	 * space.
+	 * These calls cause the new file data to be zeroed and the file
+	 * size to be changed.
+	 */
+	setprealloc = clrprealloc = 0;
+
+	switch (cmd) {
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
+								1, attr_flags);
+		if (error)
+			return error;
+		setprealloc = 1;
+		break;
+
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
+								attr_flags)))
+			return error;
+		break;
+
+	case XFS_IOC_ALLOCSP:
+	case XFS_IOC_ALLOCSP64:
+	case XFS_IOC_FREESP:
+	case XFS_IOC_FREESP64:
+		if (startoffset > fsize) {
+			error = xfs_alloc_file_space(ip, fsize,
+					startoffset - fsize, 0, attr_flags);
+			if (error)
+				break;
+		}
+
+		va.va_mask = AT_SIZE;
+		va.va_size = startoffset;
+
+		error = xfs_setattr(bdp, &va, attr_flags, credp);
+
+		if (error)
+			return error;
+
+		clrprealloc = 1;
+		break;
+
+	default:
+		ASSERT(0);
+		return XFS_ERROR(EINVAL);
+	}
+
+	/*
+	 * update the inode timestamp, mode, and prealloc flag bits
+	 */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+
+	if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
+				      0, 0, 0))) {
+		/* ASSERT(0); */
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_ihold(tp, ip);
+
+	ip->i_d.di_mode &= ~ISUID;
+
+	/*
+	 * Note that we don't have to worry about mandatory
+	 * file locking being disabled here because we only
+	 * clear the ISGID bit if the Group execute bit is
+	 * on, but if it was on then mandatory locking wouldn't
+	 * have been enabled.
+	 */
+	if (ip->i_d.di_mode & (IEXEC >> 3))
+		ip->i_d.di_mode &= ~ISGID;
+
+	xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+
+	if (setprealloc)
+		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+	else if (clrprealloc)
+		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	xfs_trans_set_sync(tp);
+
+	error = xfs_trans_commit(tp, 0, NULL);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
+vnodeops_t xfs_vnodeops = {
+	.vop_open		= xfs_open,
+	.vop_read		= xfs_read,
+	.vop_write		= xfs_write,
+	.vop_ioctl		= xfs_ioctl,
+	.vop_getattr		= xfs_getattr,
+	.vop_setattr		= xfs_setattr,
+	.vop_access		= xfs_access,
+	.vop_lookup		= xfs_lookup,
+	.vop_create		= xfs_create,
+	.vop_remove		= xfs_remove,
+	.vop_link		= xfs_link,
+	.vop_rename		= xfs_rename,
+	.vop_mkdir		= xfs_mkdir,
+	.vop_rmdir		= xfs_rmdir,
+	.vop_readdir		= xfs_readdir,
+	.vop_symlink		= xfs_symlink,
+	.vop_readlink		= xfs_readlink,
+	.vop_fsync		= xfs_fsync,
+	.vop_inactive		= xfs_inactive,
+	.vop_fid2		= xfs_fid2,
+	.vop_rwlock		= xfs_rwlock,
+	.vop_rwunlock		= xfs_rwunlock,
+	.vop_bmap		= xfs_bmap,
+	.vop_strategy		= xfs_strategy,
+	.vop_reclaim		= xfs_reclaim,
+	.vop_attr_get		= xfs_attr_get,
+	.vop_attr_set		= xfs_attr_set,
+	.vop_attr_remove	= xfs_attr_remove,
+	.vop_attr_list		= xfs_attr_list,
+	.vop_link_removed	= (vop_link_removed_t)fs_noval,
+	.vop_vnode_change	= (vop_vnode_change_t)fs_noval,
+	.vop_tosspages		= fs_tosspages,
+	.vop_flushinval_pages	= fs_flushinval_pages,
+	.vop_flush_pages	= fs_flush_pages,
+	.vop_release		= xfs_release,
+	.vop_iflush		= xfs_inode_flush,
+};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bd7073fdefaf..a7185e89d899 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -411,6 +411,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
 #define PF_IOTHREAD	0x00020000	/* this thread is needed for doing I/O to swap */
 #define PF_FROZEN	0x00040000	/* frozen for system suspend */
 #define PF_SYNC		0x00080000	/* performing fsync(), etc */
+#define PF_FSTRANS	0x00100000	/* inside a filesystem transaction */
 
 /*
  * Ptrace flags
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4856854660cc..cb24824393ea 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -151,6 +151,7 @@ enum
 	VM_DIRTY_EXPIRE_CS=15,	/* dirty_expire_centisecs */
 	VM_NR_PDFLUSH_THREADS=16, /* nr_pdflush_threads */
 	VM_OVERCOMMIT_RATIO=17, /* percent of RAM to allow overcommit in */
+	VM_PAGEBUF=18		/* struct: Control pagebuf parameters */
 };
 
 
@@ -554,6 +555,7 @@ enum
 	FS_DIR_NOTIFY=14,	/* int: directory notification enabled */
 	FS_LEASE_TIME=15,	/* int: maximum time to wait for a lease break */
 	FS_DQSTATS=16,	/* disc quota usage statistics */
+	FS_XFS=17,	/* struct: control xfs parameters */
 };
 
 /* /proc/sys/fs/quota/ */
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 3b03394fe14d..97ee2b0315a9 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -278,6 +278,7 @@ EXPORT_SYMBOL(find_lock_page);
 EXPORT_SYMBOL(find_or_create_page);
 EXPORT_SYMBOL(grab_cache_page_nowait);
 EXPORT_SYMBOL(read_cache_page);
+EXPORT_SYMBOL(mark_page_accessed);
 EXPORT_SYMBOL(vfs_readlink);
 EXPORT_SYMBOL(vfs_follow_link);
 EXPORT_SYMBOL(page_readlink);
-- 
cgit v1.2.3


From 63540ceac6d79d7703dfe6e2c161f342a03f8a11 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 14 Sep 2002 21:20:59 -0700
Subject: [PATCH] thread-exec-2.5.34-B1, BK-curr

This implements one of the last missing POSIX threading details - exec()
semantics.  Previous kernels had code that tried to handle it, but that
code had a number of disadvantages:

 - it only worked if the exec()-ing thread was the thread group leader,
   creating an assymetry. This does not work if the thread group leader
   has exited already.

 - it was racy: it sent a SIGKILL to every thread in the group but did not
   wait for them to actually process the SIGKILL. It did a yield() but
   that is not enough. All 'other' threads have to finish processing
   before we can continue with the exec().

This adds the same logic, but extended with the following enhancements:

 - works from non-leader threads just as much as the thread group leader.

 - waits for all other threads to exit before continuing with the exec().

 - reuses the PID of the group.

It would perhaps be a more generic approach to add a new syscall,
sys_ungroup() - which would do largely what de_thread() does in this
patch.

But it's not really needed now - posix_spawn() is currently implemented
via starting a non-CLONE_THREAD helper thread that does a sys_exec().
There's no API currently that needs a direct exec() from a thread - but
it could be created (such as pthread_exec_np()).  It would have the
advantage of not having to go through a helper thread, but the
difference is minimal.
---
 fs/exec.c             | 177 ++++++++++++++++++++++++++++++++++++--------------
 include/linux/sched.h |   3 +
 kernel/exit.c         |   2 +-
 kernel/fork.c         |   1 +
 kernel/signal.c       |  10 +++
 5 files changed, 145 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 317b1c395ef2..289dd068bc36 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -40,6 +40,7 @@
 #define __NO_VERSION__
 #include <linux/module.h>
 #include <linux/namei.h>
+#include <linux/proc_fs.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
@@ -493,52 +494,151 @@ static int exec_mmap(struct mm_struct *mm)
 	return 0;
 }
 
+static struct dentry *clean_proc_dentry(struct task_struct *p)
+{
+	struct dentry *proc_dentry = p->proc_dentry;
+
+	if (proc_dentry) {
+		spin_lock(&dcache_lock);
+		if (!list_empty(&proc_dentry->d_hash)) {
+			dget_locked(proc_dentry);
+			list_del_init(&proc_dentry->d_hash);
+		} else
+			proc_dentry = NULL;
+		spin_unlock(&dcache_lock);
+	}
+	return proc_dentry;
+}
+
+static inline void put_proc_dentry(struct dentry *dentry)
+{
+	if (dentry) {
+		shrink_dcache_parent(dentry);
+		dput(dentry);
+	}
+}
+
 /*
  * This function makes sure the current process has its own signal table,
  * so that flush_signal_handlers can later reset the handlers without
  * disturbing other processes.  (Other processes might share the signal
  * table via the CLONE_SIGNAL option to clone().)
  */
- 
-static inline int make_private_signals(void)
+static inline int de_thread(struct signal_struct *oldsig)
 {
-	struct signal_struct * newsig;
-
-	remove_thread_group(current, current->sig);
+	struct signal_struct *newsig;
+	int count;
 
 	if (atomic_read(&current->sig->count) <= 1)
 		return 0;
+
 	newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL);
-	if (newsig == NULL)
+	if (!newsig)
 		return -ENOMEM;
+
+	if (list_empty(&current->thread_group))
+		goto out;
+	/*
+	 * Kill all other threads in the thread group:
+	 */
+	spin_lock_irq(&oldsig->siglock);
+	if (oldsig->group_exit) {
+		/*
+		 * Another group action in progress, just
+		 * return so that the signal is processed.
+		 */
+		spin_unlock_irq(&oldsig->siglock);
+		kmem_cache_free(sigact_cachep, newsig);
+		return -EAGAIN;
+	}
+	oldsig->group_exit = 1;
+	__broadcast_thread_group(current, SIGKILL);
+
+	/*
+	 * Account for the thread group leader hanging around:
+	 */
+	count = 2;
+	if (current->pid == current->tgid)
+		count = 1;
+	while (atomic_read(&oldsig->count) > count) {
+		oldsig->group_exit_task = current;
+		current->state = TASK_UNINTERRUPTIBLE;
+		spin_unlock_irq(&oldsig->siglock);
+		schedule();
+		spin_lock_irq(&oldsig->siglock);
+		if (oldsig->group_exit_task)
+			BUG();
+	}
+	spin_unlock_irq(&oldsig->siglock);
+
+	/*
+	 * At this point all other threads have exited, all we have to
+	 * do is to wait for the thread group leader to become inactive,
+	 * and to assume its PID:
+	 */
+	if (current->pid != current->tgid) {
+		struct task_struct *leader = current->group_leader;
+		struct dentry *proc_dentry1, *proc_dentry2;
+		unsigned long state;
+
+		wait_task_inactive(leader);
+
+		write_lock_irq(&tasklist_lock);
+		proc_dentry1 = clean_proc_dentry(current);
+		proc_dentry2 = clean_proc_dentry(leader);
+
+		if (leader->tgid != current->tgid)
+			BUG();
+		if (current->pid == current->tgid)
+			BUG();
+		/*
+		 * An exec() starts a new thread group with the
+		 * TGID of the previous thread group. Rehash the
+		 * two threads with a switched PID, and release
+		 * the former thread group leader:
+		 */
+		unhash_pid(current);
+		unhash_pid(leader);
+		leader->pid = leader->tgid = current->pid;
+		current->pid = current->tgid;
+		hash_pid(current);
+		hash_pid(leader);
+		
+		list_add_tail(&current->tasks, &init_task.tasks);
+		state = leader->state;
+		write_unlock_irq(&tasklist_lock);
+
+		if (state == TASK_ZOMBIE)
+			release_task(leader);
+
+		put_proc_dentry(proc_dentry1);
+		put_proc_dentry(proc_dentry2);
+        }
+
+out:
 	spin_lock_init(&newsig->siglock);
 	atomic_set(&newsig->count, 1);
 	newsig->group_exit = 0;
 	newsig->group_exit_code = 0;
+	newsig->group_exit_task = NULL;
 	memcpy(newsig->action, current->sig->action, sizeof(newsig->action));
 	init_sigpending(&newsig->shared_pending);
 
+	remove_thread_group(current, current->sig);
 	spin_lock_irq(&current->sigmask_lock);
 	current->sig = newsig;
 	spin_unlock_irq(&current->sigmask_lock);
-	return 0;
-}
-	
-/*
- * If make_private_signals() made a copy of the signal table, decrement the
- * refcount of the original table, and free it if necessary.
- * We don't do that in make_private_signals() so that we can back off
- * in flush_old_exec() if an error occurs after calling make_private_signals().
- */
 
-static inline void release_old_signals(struct signal_struct * oldsig)
-{
-	if (current->sig == oldsig)
-		return;
 	if (atomic_dec_and_test(&oldsig->count))
 		kmem_cache_free(sigact_cachep, oldsig);
-}
 
+	if (!list_empty(&current->thread_group))
+		BUG();
+	if (current->tgid != current->pid)
+		BUG();
+	return 0;
+}
+	
 /*
  * These functions flushes out all traces of the currently running executable
  * so that a new one can be started
@@ -572,44 +672,27 @@ static inline void flush_old_files(struct files_struct * files)
 	write_unlock(&files->file_lock);
 }
 
-/*
- * An execve() will automatically "de-thread" the process.
- * - if a master thread (PID==TGID) is doing this, then all subsidiary threads
- *   will be killed (otherwise there will end up being two independent thread
- *   groups with the same TGID).
- * - if a subsidary thread is doing this, then it just leaves the thread group
- */
-static void de_thread(struct task_struct *tsk)
-{
-	if (!list_empty(&tsk->thread_group))
-		BUG();
-	/* An exec() starts a new thread group: */
-	tsk->tgid = tsk->pid;
-}
-
 int flush_old_exec(struct linux_binprm * bprm)
 {
 	char * name;
 	int i, ch, retval;
-	struct signal_struct * oldsig;
-
-	/*
-	 * Make sure we have a private signal table
-	 */
-	oldsig = current->sig;
-	retval = make_private_signals();
-	if (retval) goto flush_failed;
+	struct signal_struct * oldsig = current->sig;
 
 	/* 
 	 * Release all of the old mmap stuff
 	 */
 	retval = exec_mmap(bprm->mm);
-	if (retval) goto mmap_failed;
+	if (retval)
+		goto mmap_failed;
+	/*
+	 * Make sure we have a private signal table and that
+	 * we are unassociated from the previous thread group.
+	 */
+	retval = de_thread(oldsig);
+	if (retval)
+		goto flush_failed;
 
 	/* This is the point of no return */
-	de_thread(current);
-
-	release_old_signals(oldsig);
 
 	current->sas_ss_sp = current->sas_ss_size = 0;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a7e7c21009f..97905a0e3d9d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -222,6 +222,8 @@ struct signal_struct {
 	/* thread group exit support */
 	int			group_exit;
 	int			group_exit_code;
+
+	struct task_struct	*group_exit_task;
 };
 
 /*
@@ -552,6 +554,7 @@ extern int dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t
 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
 			      sigset_t *mask);
 extern void unblock_all_signals(void);
+extern void release_task(struct task_struct * p);
 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
 extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp);
diff --git a/kernel/exit.c b/kernel/exit.c
index 36cfce0096db..cea498d24e39 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,7 +49,7 @@ static struct dentry * __unhash_process(struct task_struct *p)
 	return proc_dentry;
 }
 
-static void release_task(struct task_struct * p)
+void release_task(struct task_struct * p)
 {
 	struct dentry *proc_dentry;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1d2dc3d5b351..b95e5aa0e0de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -633,6 +633,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t
 	atomic_set(&sig->count, 1);
 	sig->group_exit = 0;
 	sig->group_exit_code = 0;
+	sig->group_exit_task = NULL;
 	memcpy(sig->action, current->sig->action, sizeof(sig->action));
 	sig->curr_target = NULL;
 	init_sigpending(&sig->shared_pending);
diff --git a/kernel/signal.c b/kernel/signal.c
index 9952cad2d85b..c2c92616fcea 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -273,6 +273,15 @@ void __exit_sighand(struct task_struct *tsk)
 		kmem_cache_free(sigact_cachep, sig);
 	} else {
 		struct task_struct *leader = tsk->group_leader;
+
+		/*
+		 * If there is any task waiting for the group exit
+		 * then notify it:
+		 */
+		if (sig->group_exit_task && atomic_read(&sig->count) <= 2) {
+			wake_up_process(sig->group_exit_task);
+			sig->group_exit_task = NULL;
+		}
 		/*
 		 * If we are the last non-leader member of the thread
 		 * group, and the leader is zombie, then notify the
@@ -283,6 +292,7 @@ void __exit_sighand(struct task_struct *tsk)
 		 */
 		if (atomic_read(&sig->count) == 1 &&
 					leader->state == TASK_ZOMBIE) {
+
 			__remove_thread_group(tsk, sig);
 			spin_unlock(&sig->siglock);
 			do_notify_parent(leader, leader->exit_signal);
-- 
cgit v1.2.3


From 9325c6843037167d789b524228e2d81fae20eac6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@home.transmeta.com>
Date: Sat, 14 Sep 2002 21:35:42 -0700
Subject: Use CLONE_KERNEL for the common kernel thread flags.

---
 drivers/char/hvc_console.c | 2 +-
 drivers/pnp/pnpbios_core.c | 2 +-
 fs/exec.c                  | 2 +-
 fs/lockd/clntlock.c        | 2 +-
 include/linux/sched.h      | 6 +++++-
 init/main.c                | 2 +-
 kernel/sched.c             | 3 +--
 kernel/softirq.c           | 3 +--
 mm/pdflush.c               | 3 +--
 mm/vmscan.c                | 2 +-
 10 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index dc2916b898de..2f98c3812d48 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -286,7 +286,7 @@ int __init hvc_init(void)
 		panic("Couldn't register hvc console driver\n");
 
 	if (hvc_driver.num > 0)
-		kernel_thread(khvcd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+		kernel_thread(khvcd, NULL, CLONE_KERNEL);
 
 	return 0;
 }
diff --git a/drivers/pnp/pnpbios_core.c b/drivers/pnp/pnpbios_core.c
index f361240fd8ce..d481b3c64b08 100644
--- a/drivers/pnp/pnpbios_core.c
+++ b/drivers/pnp/pnpbios_core.c
@@ -1299,7 +1299,7 @@ static int __init pnpbios_thread_init(void)
 {
 #ifdef CONFIG_HOTPLUG	
 	init_completion(&unload_sem);
-	if(kernel_thread(pnp_dock_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL)>0)
+	if (kernel_thread(pnp_dock_thread, NULL, CLONE_KERNEL) > 0)
 		unloading = 0;
 #endif		
 	return 0;
diff --git a/fs/exec.c b/fs/exec.c
index 289dd068bc36..ade11d37f3d9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -522,7 +522,7 @@ static inline void put_proc_dentry(struct dentry *dentry)
  * This function makes sure the current process has its own signal table,
  * so that flush_signal_handlers can later reset the handlers without
  * disturbing other processes.  (Other processes might share the signal
- * table via the CLONE_SIGNAL option to clone().)
+ * table via the CLONE_SIGHAND option to clone().)
  */
 static inline int de_thread(struct signal_struct *oldsig)
 {
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 239fa43429bb..c5110c48422b 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -188,7 +188,7 @@ nlmclnt_recovery(struct nlm_host *host, u32 newstate)
 		nlmclnt_prepare_reclaim(host, newstate);
 		nlm_get_host(host);
 		MOD_INC_USE_COUNT;
-		kernel_thread(reclaimer, host, CLONE_SIGNAL);
+		kernel_thread(reclaimer, host, CLONE_KERNEL);
 	}
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 97905a0e3d9d..e6c24f2bfadd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -51,7 +51,11 @@ struct exec_domain;
 #define CLONE_CLEARTID	0x00200000	/* clear the userspace TID */
 #define CLONE_DETACHED	0x00400000	/* parent wants no child-exit signal */
 
-#define CLONE_SIGNAL	(CLONE_SIGHAND | CLONE_THREAD)
+/*
+ * List of flags we want to share for kernel threads,
+ * if only because they are not used by them anyway.
+ */
+#define CLONE_KERNEL	(CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
 
 /*
  * These are the constant used to fake the fixed-point load-average
diff --git a/init/main.c b/init/main.c
index b484d14df3a2..7229f7aae2ea 100644
--- a/init/main.c
+++ b/init/main.c
@@ -371,7 +371,7 @@ static void __init smp_init(void)
 
 static void rest_init(void)
 {
-	kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	kernel_thread(init, NULL, CLONE_KERNEL);
 	unlock_kernel();
  	cpu_idle();
 } 
diff --git a/kernel/sched.c b/kernel/sched.c
index ae8497903779..af2128479290 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2055,8 +2055,7 @@ static int migration_call(struct notifier_block *nfb,
 	case CPU_ONLINE:
 		printk("Starting migration thread for cpu %li\n",
 		       (long)hcpu);
-		kernel_thread(migration_thread, hcpu,
-			      CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+		kernel_thread(migration_thread, hcpu, CLONE_KERNEL);
 		while (!cpu_rq((long)hcpu)->migration_thread)
 			yield();
 		break;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 14cff3f400f8..7b717b39c66e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -395,8 +395,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 	int hotcpu = (unsigned long)hcpu;
 
 	if (action == CPU_ONLINE) {
-		if (kernel_thread(ksoftirqd, hcpu,
-				  CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0) {
+		if (kernel_thread(ksoftirqd, hcpu, CLONE_KERNEL) < 0) {
 			printk("ksoftirqd for %i failed\n", hotcpu);
 			return NOTIFY_BAD;
 		}
diff --git a/mm/pdflush.c b/mm/pdflush.c
index a380584744ae..d5b5841ef0d9 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -202,8 +202,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
 
 static void start_one_pdflush_thread(void)
 {
-	kernel_thread(pdflush, NULL,
-			CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	kernel_thread(pdflush, NULL, CLONE_KERNEL);
 }
 
 static int __init pdflush_init(void)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9b3da0460db0..a0979a8cb88c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -705,7 +705,7 @@ static int __init kswapd_init(void)
 {
 	printk("Starting kswapd\n");
 	swap_setup();
-	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	kernel_thread(kswapd, NULL, CLONE_KERNEL);
 	return 0;
 }
 
-- 
cgit v1.2.3


From e572ef2ea320724ba32094c4b4817dfde4a4bef3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@digeo.com>
Date: Sun, 15 Sep 2002 08:50:19 -0700
Subject: [PATCH] low-latency zap_page_range

zap_page_range and truncate are the two main latency problems
in the VM/VFS.  The radix-tree-based truncate grinds that into
the dust, but no algorithmic fixes for pagetable takedown have
presented themselves...

Patch from Robert Love.

Attached patch implements a low latency version of "zap_page_range()".

Calls with even moderately large page ranges result in very long lock
held times and consequently very long periods of non-preemptibility.
This function is in my list of the top 3 worst offenders.  It is gross.

This new version reimplements zap_page_range() as a loop over
ZAP_BLOCK_SIZE chunks.  After each iteration, if a reschedule is
pending, we drop page_table_lock and automagically preempt.  Note we can
not blindly drop the locks and reschedule (e.g. for the non-preempt
case) since there is a possibility to enter this codepath holding other
locks.

... I am sure you are familar with all this, its the same deal as your
low-latency work.  This patch implements the "cond_resched_lock()" as we
discussed sometime back.  I think this solution should be acceptable to
you and Linus.

There are other misc. cleanups, too.

This new zap_page_range() yields latency too-low-to-benchmark: <<1ms.
---
 include/linux/sched.h | 28 +++++++++++++++++++++++
 mm/memory.c           | 62 ++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 72 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e6c24f2bfadd..b132cc3952ea 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -956,6 +956,34 @@ static inline void cond_resched(void)
 		__cond_resched();
 }
 
+#ifdef CONFIG_PREEMPT
+
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * Note: this does not assume the given lock is the _only_ lock held.
+ * The kernel preemption counter gives us "free" checking that we are
+ * atomic -- let's use it.
+ */
+static inline void cond_resched_lock(spinlock_t * lock)
+{
+	if (need_resched() && preempt_count() == 1) {
+		_raw_spin_unlock(lock);
+		preempt_enable_no_resched();
+		__cond_resched();
+		spin_lock(lock);
+	}
+}
+
+#else
+
+static inline void cond_resched_lock(spinlock_t * lock)
+{
+}
+
+#endif
+
 /* Reevaluate whether the task has signals pending delivery.
    This is required every time the blocked sigset_t changes.
    Athread cathreaders should have t->sigmask_lock.  */
diff --git a/mm/memory.c b/mm/memory.c
index 04f7b9a2ceab..44d44f0b5b25 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -389,8 +389,8 @@ void unmap_page_range(mmu_gather_t *tlb, struct vm_area_struct *vma, unsigned lo
 {
 	pgd_t * dir;
 
-	if (address >= end)
-		BUG();
+	BUG_ON(address >= end);
+
 	dir = pgd_offset(vma->vm_mm, address);
 	tlb_start_vma(tlb, vma);
 	do {
@@ -401,30 +401,56 @@ void unmap_page_range(mmu_gather_t *tlb, struct vm_area_struct *vma, unsigned lo
 	tlb_end_vma(tlb, vma);
 }
 
-/*
- * remove user pages in a given range.
+/* Dispose of an entire mmu_gather_t per rescheduling point */
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+#define ZAP_BLOCK_SIZE	(FREE_PTE_NR * PAGE_SIZE)
+#endif
+
+/* For UP, 256 pages at a time gives nice low latency */
+#if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+#define ZAP_BLOCK_SIZE	(256 * PAGE_SIZE)
+#endif
+
+/* No preempt: go for the best straight-line efficiency */
+#if !defined(CONFIG_PREEMPT)
+#define ZAP_BLOCK_SIZE	(~(0UL))
+#endif
+
+/**
+ * zap_page_range - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
  */
 void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	mmu_gather_t *tlb;
-	unsigned long start = address, end = address + size;
+	unsigned long end, block;
 
-	/*
-	 * This is a long-lived spinlock. That's fine.
-	 * There's no contention, because the page table
-	 * lock only protects against kswapd anyway, and
-	 * even if kswapd happened to be looking at this
-	 * process we _want_ it to get stuck.
-	 */
-	if (address >= end)
-		BUG();
 	spin_lock(&mm->page_table_lock);
-	flush_cache_range(vma, address, end);
 
-	tlb = tlb_gather_mmu(mm, 0);
-	unmap_page_range(tlb, vma, address, end);
-	tlb_finish_mmu(tlb, start, end);
+  	/*
+ 	 * This was once a long-held spinlock.  Now we break the
+ 	 * work up into ZAP_BLOCK_SIZE units and relinquish the
+ 	 * lock after each interation.  This drastically lowers
+ 	 * lock contention and allows for a preemption point.
+  	 */
+	while (size) {
+		block = (size > ZAP_BLOCK_SIZE) ? ZAP_BLOCK_SIZE : size;
+ 		end = address + block;
+ 
+ 		flush_cache_range(vma, address, end);
+ 		tlb = tlb_gather_mmu(mm, 0);
+ 		unmap_page_range(tlb, vma, address, end);
+ 		tlb_finish_mmu(tlb, address, end);
+ 
+ 		cond_resched_lock(&mm->page_table_lock);
+ 
+ 		address += block;
+ 		size -= block;
+ 	}
+
 	spin_unlock(&mm->page_table_lock);
 }
 
-- 
cgit v1.2.3


From fca174ccac67746acd88af0ce8f9f4178ea303b2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@digeo.com>
Date: Sun, 15 Sep 2002 08:50:24 -0700
Subject: [PATCH] resurrect /proc/meminfo:Buffers

The /proc/meminfo:Buffers statistic is quite useful - it tells us
how effective we are being at caching filesystem metadata.

For example, increases in this figure are a measure of success of the
slablru and buffer_head-limitation patches.

The patch resurrects buffermem accounting.  The metric is calculated
on-demand, via a walk of the blockdev hashtable.
---
 fs/block_dev.c         | 23 +++++++++++++++++++++++
 fs/proc/proc_misc.c    |  4 +++-
 include/linux/blkdev.h |  2 +-
 mm/page_alloc.c        |  3 ++-
 4 files changed, 29 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f5a3d314bcd4..9b20fd7b9f52 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -332,6 +332,29 @@ struct block_device *bdget(dev_t dev)
 	return bdev;
 }
 
+long nr_blockdev_pages(void)
+{
+	long ret = 0;
+	int i;
+
+	spin_lock(&bdev_lock);
+	for (i = 0; i < ARRAY_SIZE(bdev_hashtable); i++) {
+		struct list_head *head = &bdev_hashtable[i];
+		struct list_head *lh;
+
+		if (head == NULL)
+			continue;
+		list_for_each(lh, head) {
+			struct block_device *bdev;
+
+			bdev = list_entry(lh, struct block_device, bd_hash);
+			ret += bdev->bd_inode->i_mapping->nrpages;
+		}
+	}
+	spin_unlock(&bdev_lock);
+	return ret;
+}
+
 static inline void __bd_forget(struct inode *inode)
 {
 	list_del_init(&inode->i_devices);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5bf8dbc9d871..8e75f1f9a57c 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -165,6 +165,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"MemTotal:     %8lu kB\n"
 		"MemFree:      %8lu kB\n"
 		"MemShared:    %8lu kB\n"
+		"Buffers:      %8lu kB\n"
 		"Cached:       %8lu kB\n"
 		"SwapCached:   %8lu kB\n"
 		"Active:       %8lu kB\n"
@@ -185,7 +186,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(i.totalram),
 		K(i.freeram),
 		K(i.sharedram),
-		K(ps.nr_pagecache-swapper_space.nrpages),
+		K(i.bufferram),
+		K(ps.nr_pagecache-swapper_space.nrpages-i.bufferram),
 		K(swapper_space.nrpages),
 		K(active),
 		K(inactive),
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7ef082f9462f..70781c985caa 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -327,7 +327,7 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern void generic_unplug_device(void *);
-
+extern long nr_blockdev_pages(void);
 
 /*
  * tag stuff
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fdf25f60d93f..7a85123602ad 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/suspend.h>
 #include <linux/pagevec.h>
+#include <linux/blkdev.h>
 
 unsigned long totalram_pages;
 unsigned long totalhigh_pages;
@@ -589,7 +590,7 @@ void si_meminfo(struct sysinfo *val)
 	val->totalram = totalram_pages;
 	val->sharedram = 0;
 	val->freeram = nr_free_pages();
-	val->bufferram = get_page_cache_size();
+	val->bufferram = nr_blockdev_pages();
 #ifdef CONFIG_HIGHMEM
 	val->totalhigh = totalhigh_pages;
 	val->freehigh = nr_free_highpages();
-- 
cgit v1.2.3


From c9d3808fc28fc873dcf0dc95315f644997fde1d0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@digeo.com>
Date: Sun, 15 Sep 2002 08:50:29 -0700
Subject: [PATCH] hugetlb pages

Rohit Seth's ia32 huge tlb pages patch.

Anton Blanchard took a look at this today; he seemed happy
with it and said he could borrow bits.
---
 arch/i386/Config.help       |   9 +
 arch/i386/config.in         |   2 +
 arch/i386/kernel/entry.S    |   4 +-
 arch/i386/kernel/sys_i386.c |  91 ++++++++
 arch/i386/mm/Makefile       |   1 +
 arch/i386/mm/hugetlbpage.c  | 545 ++++++++++++++++++++++++++++++++++++++++++++
 arch/i386/mm/init.c         |  31 +++
 fs/proc/array.c             |  13 +-
 fs/proc/proc_misc.c         |  13 ++
 include/asm-i386/page.h     |   8 +
 include/linux/mm.h          |  15 ++
 include/linux/mmzone.h      |   2 +-
 include/linux/sysctl.h      |   1 +
 kernel/sysctl.c             |  13 ++
 mm/memory.c                 |   8 +
 mm/mmap.c                   |  17 +-
 mm/mprotect.c               |   5 +
 mm/mremap.c                 |   4 +
 18 files changed, 773 insertions(+), 9 deletions(-)
 create mode 100644 arch/i386/mm/hugetlbpage.c

(limited to 'include/linux')

diff --git a/arch/i386/Config.help b/arch/i386/Config.help
index 0c6262860b88..b7715197d507 100644
--- a/arch/i386/Config.help
+++ b/arch/i386/Config.help
@@ -25,6 +25,15 @@ CONFIG_SMP
 
   If you don't know what to do here, say N.
 
+CONFIG_HUGETLB_PAGE
+  This enables support for huge pages.  User space applications
+  can make use of this support with the sys_alloc_hugepages and
+  sys_free_hugepages system calls.  If your applications are
+  huge page aware and your processor (Pentium or later for x86)
+  supports this, then say Y here.
+
+  Otherwise, say N.
+
 CONFIG_PREEMPT
   This option reduces the latency of the kernel when reacting to
   real-time or interactive events by allowing a low priority process to
diff --git a/arch/i386/config.in b/arch/i386/config.in
index 34527954394f..b385d8215b1f 100644
--- a/arch/i386/config.in
+++ b/arch/i386/config.in
@@ -154,6 +154,8 @@ if [ "$CONFIG_MWINCHIP3D" = "y" ]; then
    define_bool CONFIG_X86_OOSTORE y
 fi
 
+bool 'IA-32 Huge TLB Page Support (if available on processor)' CONFIG_HUGETLB_PAGE
+
 bool 'Symmetric multi-processing support' CONFIG_SMP
 bool 'Preemptible Kernel' CONFIG_PREEMPT
 if [ "$CONFIG_SMP" != "y" ]; then
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index ce3fa2791b5f..531e702b5f2b 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -759,8 +759,8 @@ ENTRY(sys_call_table)
 	.long sys_io_getevents
 	.long sys_io_submit
 	.long sys_io_cancel
-	.long sys_ni_syscall	/* 250 */	/* sys_alloc_hugepages */
-	.long sys_ni_syscall			/* sys_free_hugepages */
+	.long sys_alloc_hugepages /* 250 */
+	.long sys_free_hugepages
 	.long sys_exit_group
 
 	.rept NR_syscalls-(.-sys_call_table)/4
diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c
index 15d62d038cef..f7042004ead4 100644
--- a/arch/i386/kernel/sys_i386.c
+++ b/arch/i386/kernel/sys_i386.c
@@ -246,3 +246,94 @@ asmlinkage int sys_olduname(struct oldold_utsname * name)
 
 	return error;
 }
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define HPAGE_ALIGN(x)  (((unsigned long)x + (HPAGE_SIZE -1)) & HPAGE_MASK)
+extern long     sys_munmap(unsigned long, size_t);
+/* get_addr function gets the currently unused virtaul range in
+ * current process's address space.  It returns the LARGE_PAGE_SIZE
+ * aligned address (in cases of success).  Other kernel generic
+ * routines only could gurantee that allocated address is PAGE_SIZSE aligned.
+ */
+static unsigned long
+get_addr(unsigned long addr, unsigned long len)
+{
+	struct vm_area_struct   *vma;
+	if (addr) {
+		addr = HPAGE_ALIGN(addr);
+		vma = find_vma(current->mm, addr);
+		if (((TASK_SIZE - len) >= addr) &&
+				(!vma || addr + len <= vma->vm_start))
+			goto found_addr;
+	}
+	addr = HPAGE_ALIGN(TASK_UNMAPPED_BASE);
+	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+		if (TASK_SIZE - len < addr)
+			return -ENOMEM;
+		if (!vma || ((addr + len) < vma->vm_start))
+			goto found_addr;
+		addr = vma->vm_end;
+	}
+found_addr:
+	addr = HPAGE_ALIGN(addr);
+	return addr;
+}
+
+asmlinkage unsigned long
+sys_alloc_hugepages(int key, unsigned long addr, unsigned long len, int prot, int flag)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long   raddr;
+	int     retval = 0;
+	extern int alloc_hugetlb_pages(int, unsigned long, unsigned long, int, int);
+	if (!(cpu_has_pse))
+		return -EINVAL;
+	if (key < 0)
+		return -EINVAL;
+	if (len & (HPAGE_SIZE - 1))
+		return -EINVAL;
+	down_write(&mm->mmap_sem);
+	raddr = get_addr(addr, len);
+	if (raddr == -ENOMEM)
+		goto raddr_out;
+	retval = alloc_hugetlb_pages(key, raddr, len, prot, flag);
+
+raddr_out: up_write(&mm->mmap_sem);
+	if (retval < 0)
+		return (unsigned long) retval;
+	return raddr;
+}
+
+asmlinkage int
+sys_free_hugepages(unsigned long addr)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct   *vma;
+	int	retval;
+	extern int free_hugepages(struct vm_area_struct *);
+
+	vma = find_vma(current->mm, addr);
+	if ((!vma) || (!is_vm_hugetlb_page(vma)) || (vma->vm_start!=addr))
+		return -EINVAL;
+	down_write(&mm->mmap_sem);
+	spin_lock(&mm->page_table_lock);
+	retval =  free_hugepages(vma);
+	spin_unlock(&mm->page_table_lock);
+	up_write(&mm->mmap_sem);
+	return retval;
+}
+
+#else
+
+asmlinkage unsigned long
+sys_alloc_hugepages(int key, unsigned long addr, size_t len, int prot, int flag)
+{
+	return -ENOSYS;
+}
+asmlinkage int
+sys_free_hugepages(unsigned long addr)
+{
+	return -ENOSYS;
+}
+
+#endif
diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile
index 8451fe186c3a..ad32eb72052b 100644
--- a/arch/i386/mm/Makefile
+++ b/arch/i386/mm/Makefile
@@ -12,5 +12,6 @@ O_TARGET := mm.o
 obj-y	 := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o 
 obj-$(CONFIG_DISCONTIGMEM)	+= discontig.o
 export-objs := pageattr.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 
 include $(TOPDIR)/Rules.make
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
new file mode 100644
index 000000000000..c50cec1dbafb
--- /dev/null
+++ b/arch/i386/mm/hugetlbpage.c
@@ -0,0 +1,545 @@
+/*
+ * IA-32 Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+static struct vm_operations_struct hugetlb_vm_ops;
+struct list_head htlbpage_freelist;
+spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
+extern long htlbpagemem;
+
+void zap_hugetlb_resources(struct vm_area_struct *);
+
+#define MAX_ID 	32
+struct htlbpagekey {
+	struct inode *in;
+	int key;
+} htlbpagek[MAX_ID];
+
+static struct inode *
+find_key_inode(int key)
+{
+	int i;
+
+	for (i = 0; i < MAX_ID; i++) {
+		if (htlbpagek[i].key == key)
+			return (htlbpagek[i].in);
+	}
+	return NULL;
+}
+static struct page *
+alloc_hugetlb_page(void)
+{
+	struct list_head *curr, *head;
+	struct page *page;
+
+	spin_lock(&htlbpage_lock);
+
+	head = &htlbpage_freelist;
+	curr = head->next;
+
+	if (curr == head) {
+		spin_unlock(&htlbpage_lock);
+		return NULL;
+	}
+	page = list_entry(curr, struct page, list);
+	list_del(curr);
+	htlbpagemem--;
+	spin_unlock(&htlbpage_lock);
+	set_page_count(page, 1);
+	memset(page_address(page), 0, HPAGE_SIZE);
+	return page;
+}
+
+static void
+free_hugetlb_page(struct page *page)
+{
+	spin_lock(&htlbpage_lock);
+	if ((page->mapping != NULL) && (page_count(page) == 2)) {
+		struct inode *inode = page->mapping->host;
+		int i;
+
+		ClearPageDirty(page);
+		remove_from_page_cache(page);
+		set_page_count(page, 1);
+		if ((inode->i_size -= HPAGE_SIZE) == 0) {
+			for (i = 0; i < MAX_ID; i++)
+				if (htlbpagek[i].key == inode->i_ino) {
+					htlbpagek[i].key = 0;
+					htlbpagek[i].in = NULL;
+					break;
+				}
+			kfree(inode);
+		}
+	}
+	if (put_page_testzero(page)) {
+		list_add(&page->list, &htlbpage_freelist);
+		htlbpagemem++;
+	}
+	spin_unlock(&htlbpage_lock);
+}
+
+static pte_t *
+huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pmd_t *pmd = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	pmd = pmd_alloc(mm, pgd, addr);
+	return (pte_t *) pmd;
+}
+
+static pte_t *
+huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pmd_t *pmd = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	pmd = pmd_offset(pgd, addr);
+	return (pte_t *) pmd;
+}
+
+#define mk_pte_huge(entry) {entry.pte_low |= (_PAGE_PRESENT | _PAGE_PSE);}
+
+static void
+set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+	     struct page *page, pte_t * page_table, int write_access)
+{
+	pte_t entry;
+
+	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+	if (write_access) {
+		entry =
+		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+	} else
+		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+	entry = pte_mkyoung(entry);
+	mk_pte_huge(entry);
+	set_pte(page_table, entry);
+	return;
+}
+
+static int
+anon_get_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+	       int write_access, pte_t * page_table)
+{
+	struct page *page;
+
+	page = alloc_hugetlb_page();
+	if (page == NULL)
+		return -1;
+	set_huge_pte(mm, vma, page, page_table, write_access);
+	return 1;
+}
+
+int
+make_hugetlb_pages_present(unsigned long addr, unsigned long end, int flags)
+{
+	int write;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	pte_t *pte;
+
+	vma = find_vma(mm, addr);
+	if (!vma)
+		goto out_error1;
+
+	write = (vma->vm_flags & VM_WRITE) != 0;
+	if ((vma->vm_end - vma->vm_start) & (HPAGE_SIZE - 1))
+		goto out_error1;
+	spin_lock(&mm->page_table_lock);
+	do {
+		pte = huge_pte_alloc(mm, addr);
+		if ((pte) && (pte_none(*pte))) {
+			if (anon_get_hugetlb_page(mm, vma,
+					   write ? VM_WRITE : VM_READ,
+					   pte) == -1)
+				goto out_error;
+		} else
+			goto out_error;
+		addr += HPAGE_SIZE;
+	} while (addr < end);
+	spin_unlock(&mm->page_table_lock);
+	vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
+	if (flags & MAP_PRIVATE)
+		vma->vm_flags |= VM_DONTCOPY;
+	vma->vm_ops = &hugetlb_vm_ops;
+	return 0;
+out_error:		/* Error case, remove the partial lp_resources. */
+	if (addr > vma->vm_start) {
+		vma->vm_end = addr;
+		zap_hugetlb_resources(vma);
+		vma->vm_end = end;
+	}
+	spin_unlock(&mm->page_table_lock);
+      out_error1:
+	return -1;
+}
+
+int
+copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+			struct vm_area_struct *vma)
+{
+	pte_t *src_pte, *dst_pte, entry;
+	struct page *ptepage;
+	unsigned long addr = vma->vm_start;
+	unsigned long end = vma->vm_end;
+
+	while (addr < end) {
+		dst_pte = huge_pte_alloc(dst, addr);
+		if (!dst_pte)
+			goto nomem;
+		src_pte = huge_pte_offset(src, addr);
+		entry = *src_pte;
+		ptepage = pte_page(entry);
+		get_page(ptepage);
+		set_pte(dst_pte, entry);
+		dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+		addr += HPAGE_SIZE;
+	}
+	return 0;
+
+nomem:
+	return -ENOMEM;
+}
+
+int
+follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		    struct page **pages, struct vm_area_struct **vmas,
+		    unsigned long *st, int *length, int i)
+{
+	pte_t *ptep, pte;
+	unsigned long start = *st;
+	unsigned long pstart;
+	int len = *length;
+	struct page *page;
+
+	do {
+		pstart = start;
+		ptep = huge_pte_offset(mm, start);
+		pte = *ptep;
+
+back1:
+		page = pte_page(pte);
+		if (pages) {
+			page += ((start & ~HPAGE_MASK) >> PAGE_SHIFT);
+			pages[i] = page;
+		}
+		if (vmas)
+			vmas[i] = vma;
+		i++;
+		len--;
+		start += PAGE_SIZE;
+		if (((start & HPAGE_MASK) == pstart) && len &&
+				(start < vma->vm_end))
+			goto back1;
+	} while (len && start < vma->vm_end);
+	*length = len;
+	*st = start;
+	return i;
+}
+
+void
+zap_hugetlb_resources(struct vm_area_struct *mpnt)
+{
+	struct mm_struct *mm = mpnt->vm_mm;
+	unsigned long len, addr, end;
+	pte_t *ptep;
+	struct page *page;
+
+	addr = mpnt->vm_start;
+	end = mpnt->vm_end;
+	len = end - addr;
+	do {
+		ptep = huge_pte_offset(mm, addr);
+		page = pte_page(*ptep);
+		pte_clear(ptep);
+		free_hugetlb_page(page);
+		addr += HPAGE_SIZE;
+	} while (addr < end);
+	mm->rss -= (len >> PAGE_SHIFT);
+	mpnt->vm_ops = NULL;
+	flush_tlb_range(mpnt, end - len, end);
+}
+
+static void
+unlink_vma(struct vm_area_struct *mpnt)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	vma = mm->mmap;
+	if (vma == mpnt) {
+		mm->mmap = vma->vm_next;
+	} else {
+		while (vma->vm_next != mpnt) {
+			vma = vma->vm_next;
+		}
+		vma->vm_next = mpnt->vm_next;
+	}
+	rb_erase(&mpnt->vm_rb, &mm->mm_rb);
+	mm->mmap_cache = NULL;
+	mm->map_count--;
+}
+
+int
+free_hugepages(struct vm_area_struct *mpnt)
+{
+	unlink_vma(mpnt);
+	zap_hugetlb_resources(mpnt);
+	kmem_cache_free(vm_area_cachep, mpnt);
+	return 1;
+}
+
+static struct inode *
+set_new_inode(unsigned long len, int prot, int flag, int key)
+{
+	struct inode *inode;
+	int i;
+
+	for (i = 0; i < MAX_ID; i++) {
+		if (htlbpagek[i].key == 0)
+			break;
+	}
+	if (i == MAX_ID)
+		return NULL;
+	inode = kmalloc(sizeof (struct inode), GFP_KERNEL);
+	if (inode == NULL)
+		return NULL;
+
+	inode_init_once(inode);
+	atomic_inc(&inode->i_writecount);
+	inode->i_mapping = &inode->i_data;
+	inode->i_mapping->host = inode;
+	inode->i_ino = (unsigned long)key;
+
+	htlbpagek[i].key = key;
+	htlbpagek[i].in = inode;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_mode = prot;
+	inode->i_size = len;
+	return inode;
+}
+
+static int
+check_size_prot(struct inode *inode, unsigned long len, int prot, int flag)
+{
+	if (inode->i_uid != current->fsuid)
+		return -1;
+	if (inode->i_gid != current->fsgid)
+		return -1;
+	if (inode->i_size != len)
+		return -1;
+	return 0;
+}
+
+static int
+alloc_shared_hugetlb_pages(int key, unsigned long addr, unsigned long len,
+			   int prot, int flag)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct inode *inode;
+	struct address_space *mapping;
+	struct page *page;
+	int idx;
+	int retval = -ENOMEM;
+	int newalloc = 0;
+
+try_again:
+	spin_lock(&htlbpage_lock);
+
+	inode = find_key_inode(key);
+	if (inode == NULL) {
+		if (!capable(CAP_SYS_ADMIN)) {
+			if (!in_group_p(0)) {
+				retval = -EPERM;
+				goto out_err;
+			}
+		}
+		if (!(flag & IPC_CREAT)) {
+			retval = -ENOENT;
+			goto out_err;
+		}
+		inode = set_new_inode(len, prot, flag, key);
+		if (inode == NULL)
+			goto out_err;
+		newalloc = 1;
+	} else {
+		if (check_size_prot(inode, len, prot, flag) < 0) {
+			retval = -EINVAL;
+			goto out_err;
+		}
+		else if (atomic_read(&inode->i_writecount)) {
+			spin_unlock(&htlbpage_lock);
+			goto try_again;
+		}
+	}
+	spin_unlock(&htlbpage_lock);
+	mapping = inode->i_mapping;
+
+	addr = do_mmap_pgoff(NULL, addr, len, (unsigned long) prot,
+			MAP_NORESERVE|MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, 0);
+	if (IS_ERR((void *) addr))
+		goto freeinode;
+
+	vma = find_vma(mm, addr);
+	if (!vma) {
+		retval = -EINVAL;
+		goto freeinode;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	do {
+		pte_t *pte = huge_pte_alloc(mm, addr);
+		if ((pte) && (pte_none(*pte))) {
+			idx = (addr - vma->vm_start) >> HPAGE_SHIFT;
+			page = find_get_page(mapping, idx);
+			if (page == NULL) {
+				page = alloc_hugetlb_page();
+				if (page == NULL)
+					goto out;
+				add_to_page_cache(page, mapping, idx);
+			}
+			set_huge_pte(mm, vma, page, pte,
+				     (vma->vm_flags & VM_WRITE));
+		} else
+			goto out;
+		addr += HPAGE_SIZE;
+	} while (addr < vma->vm_end);
+	retval = 0;
+	vma->vm_flags |= (VM_HUGETLB | VM_RESERVED);
+	vma->vm_ops = &hugetlb_vm_ops;
+	spin_unlock(&mm->page_table_lock);
+	spin_lock(&htlbpage_lock);
+	atomic_set(&inode->i_writecount, 0);
+	spin_unlock(&htlbpage_lock);
+	return retval;
+out:
+	if (addr > vma->vm_start) {
+		unsigned long raddr;
+		raddr = vma->vm_end;
+		vma->vm_end = addr;
+		zap_hugetlb_resources(vma);
+		vma->vm_end = raddr;
+	}
+	spin_unlock(&mm->page_table_lock);
+	do_munmap(mm, vma->vm_start, len);
+	if (newalloc)
+		goto freeinode;
+	return retval;
+out_err: spin_unlock(&htlbpage_lock);
+freeinode:
+	 if (newalloc) {
+		 for(idx=0;idx<MAX_ID;idx++)
+			 if (htlbpagek[idx].key == inode->i_ino) {
+				 htlbpagek[idx].key = 0;
+				 htlbpagek[idx].in = NULL;
+				 break;
+			 }
+		 kfree(inode);
+	 }
+	 return retval;
+}
+
+static int
+alloc_private_hugetlb_pages(int key, unsigned long addr, unsigned long len,
+			    int prot, int flag)
+{
+	if (!capable(CAP_SYS_ADMIN)) {
+		if (!in_group_p(0))
+			return -EPERM;
+	}
+	addr = do_mmap_pgoff(NULL, addr, len, prot,
+			MAP_NORESERVE|MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, 0);
+	if (IS_ERR((void *) addr))
+		return -ENOMEM;
+	if (make_hugetlb_pages_present(addr, (addr + len), flag) < 0) {
+		do_munmap(current->mm, addr, len);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+int
+alloc_hugetlb_pages(int key, unsigned long addr, unsigned long len, int prot,
+		    int flag)
+{
+	if (key > 0)
+		return alloc_shared_hugetlb_pages(key, addr, len, prot, flag);
+	return alloc_private_hugetlb_pages(key, addr, len, prot, flag);
+}
+
+int
+set_hugetlb_mem_size(int count)
+{
+	int j, lcount;
+	struct page *page, *map;
+	extern long htlbzone_pages;
+	extern struct list_head htlbpage_freelist;
+
+	if (count < 0)
+		lcount = count;
+	else
+		lcount = count - htlbzone_pages;
+
+	if (lcount > 0) {	/* Increase the mem size. */
+		while (lcount--) {
+			page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
+			if (page == NULL)
+				break;
+			map = page;
+			for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
+				SetPageReserved(map);
+				map++;
+			}
+			spin_lock(&htlbpage_lock);
+			list_add(&page->list, &htlbpage_freelist);
+			htlbpagemem++;
+			htlbzone_pages++;
+			spin_unlock(&htlbpage_lock);
+		}
+		return (int) htlbzone_pages;
+	}
+	/* Shrink the memory size. */
+	while (lcount++) {
+		page = alloc_hugetlb_page();
+		if (page == NULL)
+			break;
+		spin_lock(&htlbpage_lock);
+		htlbzone_pages--;
+		spin_unlock(&htlbpage_lock);
+		map = page;
+		for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
+			map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+					1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+					1 << PG_private | 1<< PG_writeback);
+			set_page_count(page, 0);
+			map++;
+		}
+		set_page_count(page, 1);
+		__free_pages(page, HUGETLB_PAGE_ORDER);
+	}
+	return (int) htlbzone_pages;
+}
+
+static struct vm_operations_struct hugetlb_vm_ops = {
+	.close	= zap_hugetlb_resources,
+};
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 3bf5f68e0bac..0d026c67522c 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -431,6 +431,13 @@ static void __init set_max_mapnr_init(void)
 extern void set_max_mapnr_init(void);
 #endif /* !CONFIG_DISCONTIGMEM */
 
+#ifdef CONFIG_HUGETLB_PAGE
+long    htlbpagemem = 0;
+int     htlbpage_max;
+long    htlbzone_pages;
+extern struct list_head htlbpage_freelist;
+#endif
+
 void __init mem_init(void)
 {
 	extern int ppro_with_ram_bug(void);
@@ -493,6 +500,30 @@ void __init mem_init(void)
 #ifndef CONFIG_SMP
 	zap_low_mappings();
 #endif
+#ifdef CONFIG_HUGETLB_PAGE
+	{
+		long	i, j;
+		struct	page	*page, *map;
+		/*For now reserve quarter for hugetlb_pages.*/
+		htlbzone_pages = (max_low_pfn >> ((HPAGE_SHIFT - PAGE_SHIFT) + 2)) ;
+		/*Will make this kernel command line. */
+		INIT_LIST_HEAD(&htlbpage_freelist);
+		for (i=0; i<htlbzone_pages; i++) {
+			page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
+			if (page == NULL)
+				break;
+			map = page;
+			for (j=0; j<(HPAGE_SIZE/PAGE_SIZE); j++) {
+				SetPageReserved(map);
+				map++;
+			}
+			list_add(&page->list, &htlbpage_freelist);
+		}
+		printk("Total Huge_TLB_Page memory pages allocated %ld\n", i);
+		htlbzone_pages = htlbpagemem = i;
+		htlbpage_max = i;
+	}
+#endif
 }
 
 #if CONFIG_X86_PAE
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0084e27fe7a8..feb2cbab4699 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -487,7 +487,18 @@ int proc_pid_statm(struct task_struct *task, char * buffer)
 		while (vma) {
 			pgd_t *pgd = pgd_offset(mm, vma->vm_start);
 			int pages = 0, shared = 0, dirty = 0, total = 0;
-
+			if (is_vm_hugetlb_page(vma)) {
+				int num_pages = ((vma->vm_end - vma->vm_start)/PAGE_SIZE);
+
+				resident += num_pages;
+				if (!(vma->vm_flags & VM_DONTCOPY))
+					share += num_pages;
+				if (vma->vm_flags & VM_WRITE)
+					dt += num_pages;
+				drs += num_pages;
+				vma = vma->vm_next;
+				continue;
+			}
 			statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
 			resident += pages;
 			share += shared;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 8e75f1f9a57c..9c37ed85a372 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -206,6 +206,19 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		non_flushes
 		);
 
+#ifdef CONFIG_HUGETLB_PAGE
+	{
+		extern unsigned long htlbpagemem, htlbzone_pages;
+		len += sprintf(page + len,
+				"HugePages:    %8lu\n"
+				"Available:    %8lu\n"
+				"Size:         %8lu kB\n",
+				htlbzone_pages,
+				htlbpagemem,
+				HPAGE_SIZE/1024);
+	}
+
+#endif
 	return proc_calc_metrics(page, start, off, count, eof, len);
 #undef K
 }
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index a6fe2daa59a1..5a09fd4b72f1 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -44,14 +44,22 @@ typedef struct { unsigned long pte_low, pte_high; } pte_t;
 typedef struct { unsigned long long pmd; } pmd_t;
 typedef struct { unsigned long long pgd; } pgd_t;
 #define pte_val(x)	((x).pte_low | ((unsigned long long)(x).pte_high << 32))
+#define HPAGE_SHIFT	21
 #else
 typedef struct { unsigned long pte_low; } pte_t;
 typedef struct { unsigned long pmd; } pmd_t;
 typedef struct { unsigned long pgd; } pgd_t;
 #define pte_val(x)	((x).pte_low)
+#define HPAGE_SHIFT	22
 #endif
 #define PTE_MASK	PAGE_MASK
 
+#ifdef CONFIG_HUGETLB_PAGE
+#define HPAGE_SIZE	((1UL) << HPAGE_SHIFT)
+#define HPAGE_MASK	(~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
+#endif
+
 typedef struct { unsigned long pgprot; } pgprot_t;
 
 #define pmd_val(x)	((x).pmd)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4dfac9d2cb5c..c90392074cc2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -104,6 +104,7 @@ struct vm_area_struct {
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
 #define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
+#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 
 #define VM_STACK_FLAGS	(0x00000100 | VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT)
 
@@ -377,6 +378,20 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
 
+#ifdef CONFIG_HUGETLB_PAGE
+#define is_vm_hugetlb_page(vma) (vma->vm_flags & VM_HUGETLB)
+extern int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
+extern int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
+extern	int free_hugepages(struct vm_area_struct *);
+
+#else
+#define is_vm_hugetlb_page(vma) (0)
+#define follow_hugetlb_page(mm, vma, pages, vmas, start, len, i) (0)
+#define copy_hugetlb_page_range(dst, src, vma) (0)
+#define free_hugepages(mpnt)  do { } while(0)
+#endif
+
+
 /*
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e5b6bc1111b8..b271e144972a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -16,7 +16,7 @@
  */
 
 #ifndef CONFIG_FORCE_MAX_ZONEORDER
-#define MAX_ORDER 10
+#define MAX_ORDER 11
 #else
 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
 #endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4856854660cc..527a65938ade 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -128,6 +128,7 @@ enum
 	KERN_TAINTED=53,	/* int: various kernel tainted flags */
 	KERN_CADPID=54,		/* int: PID of the process to notify on CAD */
 	KERN_PIDMAX=55,		/* int: PID # limit */
+	KERN_HUGETLB_PAGE_NUM=56, /* int: Number of available Huge Pages */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f678f46e7a8f..6f92068e3f29 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -98,6 +98,11 @@ int proc_dol2crvec(ctl_table *table, int write, struct file *filp,
 extern int acct_parm[];
 #endif
 
+#ifdef CONFIG_HUGETLB_PAGE
+extern	int htlbpage_max;
+extern	int	set_hugetlb_mem_size(int);
+#endif
+
 static int parse_table(int *, int, void *, size_t *, void *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -258,6 +263,10 @@ static ctl_table kern_table[] = {
 #endif
 	{KERN_PIDMAX, "pid_max", &pid_max, sizeof (int),
 	 0600, NULL, &proc_dointvec},
+#ifdef CONFIG_HUGETLB_PAGE
+	 {KERN_HUGETLB_PAGE_NUM, "numhugepages", &htlbpage_max, sizeof(int), 0644, NULL, 
+	  &proc_dointvec},
+#endif
 	{0}
 };
 
@@ -897,6 +906,10 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
 				val = -val;
 			buffer += len;
 			left -= len;
+#ifdef CONFIG_HUGETLB_PAGE
+			if (i == &htlbpage_max)
+				val = set_hugetlb_mem_size(val);
+#endif
 			switch(op) {
 			case OP_SET:	*i = val; break;
 			case OP_AND:	*i &= val; break;
diff --git a/mm/memory.c b/mm/memory.c
index 44d44f0b5b25..c886e849231b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -208,6 +208,9 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long end = vma->vm_end;
 	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
+	if (is_vm_hugetlb_page(vma))
+		return copy_hugetlb_page_range(dst, src, vma);
+
 	src_pgd = pgd_offset(src, address)-1;
 	dst_pgd = pgd_offset(dst, address)-1;
 
@@ -530,6 +533,11 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				|| !(flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
+		if (is_vm_hugetlb_page(vma)) {
+			i = follow_hugetlb_page(mm, vma, pages, vmas,
+						&start, &len, i);
+			continue;
+		}
 		spin_lock(&mm->page_table_lock);
 		do {
 			struct page *map;
diff --git a/mm/mmap.c b/mm/mmap.c
index 4e39ff6d91ff..92b179a8b3ab 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1031,10 +1031,14 @@ static struct vm_area_struct *touched_by_munmap(struct mm_struct *mm,
 	touched = NULL;
 	do {
 		struct vm_area_struct *next = mpnt->vm_next;
-		mpnt->vm_next = touched;
-		touched = mpnt;
-		mm->map_count--;
-		rb_erase(&mpnt->vm_rb, &mm->mm_rb);
+		if (!(is_vm_hugetlb_page(mpnt))) {
+			mpnt->vm_next = touched;
+			touched = mpnt;
+			rb_erase(&mpnt->vm_rb, &mm->mm_rb);
+			mm->map_count--;
+		}
+		else
+			free_hugepages(mpnt);
 		mpnt = next;
 	} while (mpnt && mpnt->vm_start < end);
 	*npp = mpnt;
@@ -1273,7 +1277,10 @@ void exit_mmap(struct mm_struct * mm)
 			vm_unacct_memory((end - start) >> PAGE_SHIFT);
 
 		mm->map_count--;
-		unmap_page_range(tlb, mpnt, start, end);
+		if (!(is_vm_hugetlb_page(mpnt)))
+			unmap_page_range(tlb, mpnt, start, end);
+		else
+			mpnt->vm_ops->close(mpnt);
 		mpnt = mpnt->vm_next;
 	}
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 19733cc75eae..d2c72d8a5fd6 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -321,6 +321,11 @@ asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot
 
 		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
+		if (is_vm_hugetlb_page(vma)) {
+			error = -EACCES;
+			goto out;
+		}
+
 		newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC));
 		if ((newflags & ~(newflags >> 4)) & 0xf) {
 			error = -EACCES;
diff --git a/mm/mremap.c b/mm/mremap.c
index b7cdce69ad01..0d22f3d6c20f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -311,6 +311,10 @@ unsigned long do_mremap(unsigned long addr,
 	vma = find_vma(current->mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
+	if (is_vm_hugetlb_page(vma)) {
+		ret = -EINVAL;
+		goto out;
+	}
 	/* We can't remap across vm area boundaries */
 	if (old_len > vma->vm_end - addr)
 		goto out;
-- 
cgit v1.2.3


From 73960360a518d0031fb66a4d5140665e9ea552f3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@digeo.com>
Date: Sun, 15 Sep 2002 08:50:39 -0700
Subject: [PATCH] add /proc/meminfo:Mapped

The patch adds a "Mapped" field to /proc/meminfo - tha amount of memory
which is mapped into pagetables.

This is a useful statistic to monitor when testing and observing the
vitual memory system.
---
 fs/proc/proc_misc.c        | 2 ++
 include/linux/page-flags.h | 1 +
 mm/page_alloc.c            | 1 +
 mm/rmap.c                  | 5 +++++
 4 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 9c37ed85a372..7289526aaace 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -178,6 +178,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		"SwapFree:     %8lu kB\n"
 		"Dirty:        %8lu kB\n"
 		"Writeback:    %8lu kB\n"
+		"Mapped:       %8lu kB\n"
 		"Committed_AS: %8u kB\n"
 		"PageTables:   %8lu kB\n"
 		"ReverseMaps:  %8lu\n"
@@ -199,6 +200,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		K(i.freeswap),
 		K(ps.nr_dirty),
 		K(ps.nr_writeback),
+		K(ps.nr_mapped),
 		K(committed),
 		K(ps.nr_page_table_pages),
 		ps.nr_reverse_maps,
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 42d1853294ac..6475e261f609 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -78,6 +78,7 @@ extern struct page_state {
 	unsigned long nr_pagecache;
 	unsigned long nr_page_table_pages;
 	unsigned long nr_reverse_maps;
+	unsigned long nr_mapped;
 } ____cacheline_aligned_in_smp page_states[NR_CPUS];
 
 extern void get_page_state(struct page_state *ret);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7a85123602ad..a54548bbf27b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -562,6 +562,7 @@ void get_page_state(struct page_state *ret)
 		ret->nr_pagecache += ps->nr_pagecache;
 		ret->nr_page_table_pages += ps->nr_page_table_pages;
 		ret->nr_reverse_maps += ps->nr_reverse_maps;
+		ret->nr_mapped += ps->nr_mapped;
 	}
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index f8e34678004b..41c2c8837fc2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -214,6 +214,7 @@ void page_add_rmap(struct page * page, pte_t * ptep)
 	if (page->pte.direct == 0) {
 		page->pte.direct = pte_paddr;
 		SetPageDirect(page);
+		inc_page_state(nr_mapped);
 		goto out;
 	}
 
@@ -336,6 +337,8 @@ void page_remove_rmap(struct page * page, pte_t * ptep)
 
 out:
 	pte_chain_unlock(page);
+	if (!page_mapped(page))
+		dec_page_state(nr_mapped);
 	return;
 }
 
@@ -501,6 +504,8 @@ int try_to_unmap(struct page * page)
 		}
 	}
 out:
+	if (!page_mapped(page))
+		dec_page_state(nr_mapped);
 	return ret;
 }
 
-- 
cgit v1.2.3


From db74867540721876f23aa0e0a7c2f3d6c900c5bc Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@digeo.com>
Date: Sun, 15 Sep 2002 08:50:44 -0700
Subject: [PATCH] ext3 ceanup: use EXT3_SB

Patch from Jani Monoses <jani@iv.ro>

"This turns the remaining parts of ext3 to EXT3_SB and turns the
 latter from a macro to inline function which returns the generic_sbp
 field of u.

 linux/fs.h is not touched by this patch though.

 Intermezzo's three uses of ext3_sb are also not changed."
---
 fs/ext3/balloc.c        | 60 ++++++++++++++++++++++++-------------------------
 fs/ext3/dir.c           |  2 +-
 fs/ext3/ialloc.c        | 46 ++++++++++++++++++-------------------
 fs/ext3/inode.c         | 14 ++++++------
 fs/ext3/ioctl.c         |  6 ++---
 fs/ext3/namei.c         |  6 ++---
 fs/ext3/super.c         | 31 +++++++++++++++----------
 include/linux/ext3_fs.h | 25 ++++++++++++---------
 8 files changed, 100 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index f733c4b50686..c6760c7b42dc 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -46,18 +46,18 @@ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
 	unsigned long desc;
 	struct ext3_group_desc * gdp;
 
-	if (block_group >= sb->u.ext3_sb.s_groups_count) {
+	if (block_group >= EXT3_SB(sb)->s_groups_count) {
 		ext3_error (sb, "ext3_get_group_desc",
 			    "block_group >= groups_count - "
 			    "block_group = %d, groups_count = %lu",
-			    block_group, sb->u.ext3_sb.s_groups_count);
+			    block_group, EXT3_SB(sb)->s_groups_count);
 
 		return NULL;
 	}
 	
 	group_desc = block_group / EXT3_DESC_PER_BLOCK(sb);
 	desc = block_group % EXT3_DESC_PER_BLOCK(sb);
-	if (!sb->u.ext3_sb.s_group_desc[group_desc]) {
+	if (!EXT3_SB(sb)->s_group_desc[group_desc]) {
 		ext3_error (sb, "ext3_get_group_desc",
 			    "Group descriptor not loaded - "
 			    "block_group = %d, group_desc = %lu, desc = %lu",
@@ -66,9 +66,9 @@ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
 	}
 	
 	gdp = (struct ext3_group_desc *) 
-	      sb->u.ext3_sb.s_group_desc[group_desc]->b_data;
+	      EXT3_SB(sb)->s_group_desc[group_desc]->b_data;
 	if (bh)
-		*bh = sb->u.ext3_sb.s_group_desc[group_desc];
+		*bh = EXT3_SB(sb)->s_group_desc[group_desc];
 	return gdp + desc;
 }
 
@@ -119,7 +119,7 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
 		return;
 	}
 	lock_super (sb);
-	es = sb->u.ext3_sb.s_es;
+	es = EXT3_SB(sb)->s_es;
 	if (block < le32_to_cpu(es->s_first_data_block) || 
 	    (block + count) > le32_to_cpu(es->s_blocks_count)) {
 		ext3_error (sb, "ext3_free_blocks",
@@ -155,9 +155,9 @@ do_more:
 	if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
 	    in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
 	    in_range (block, le32_to_cpu(gdp->bg_inode_table),
-		      sb->u.ext3_sb.s_itb_per_group) ||
+		      EXT3_SB(sb)->s_itb_per_group) ||
 	    in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
-		      sb->u.ext3_sb.s_itb_per_group))
+		      EXT3_SB(sb)->s_itb_per_group))
 		ext3_error (sb, "ext3_free_blocks",
 			    "Freeing blocks in system zones - "
 			    "Block = %lu, count = %lu",
@@ -183,8 +183,8 @@ do_more:
 	if (err)
 		goto error_return;
 
-	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
 	if (err)
 		goto error_return;
 
@@ -253,8 +253,8 @@ do_more:
 	if (!err) err = ret;
 
 	/* And the superblock */
-	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock");
-	ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "dirtied superblock");
+	ret = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	if (!err) err = ret;
 
 	if (overflow && !err) {
@@ -408,12 +408,12 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
 	}
 
 	lock_super(sb);
-	es = sb->u.ext3_sb.s_es;
+	es = EXT3_SB(sb)->s_es;
 	if (le32_to_cpu(es->s_free_blocks_count) <=
 			le32_to_cpu(es->s_r_blocks_count) &&
-	    ((sb->u.ext3_sb.s_resuid != current->fsuid) &&
-	     (sb->u.ext3_sb.s_resgid == 0 ||
-	      !in_group_p(sb->u.ext3_sb.s_resgid)) && 
+	    ((EXT3_SB(sb)->s_resuid != current->fsuid) &&
+	     (EXT3_SB(sb)->s_resgid == 0 ||
+	      !in_group_p(EXT3_SB(sb)->s_resgid)) && 
 	     !capable(CAP_SYS_RESOURCE)))
 		goto out;
 
@@ -464,9 +464,9 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
 	 * Now search the rest of the groups.  We assume that 
 	 * i and gdp correctly point to the last group visited.
 	 */
-	for (bit = 0; bit < sb->u.ext3_sb.s_groups_count; bit++) {
+	for (bit = 0; bit < EXT3_SB(sb)->s_groups_count; bit++) {
 		group_no++;
-		if (group_no >= sb->u.ext3_sb.s_groups_count)
+		if (group_no >= EXT3_SB(sb)->s_groups_count)
 			group_no = 0;
 		gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
 		if (!gdp) {
@@ -518,8 +518,8 @@ got_block:
 	if (fatal)
 		goto out;
 
-	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
-	fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
+	fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
 	if (fatal)
 		goto out;
 
@@ -529,7 +529,7 @@ got_block:
 	if (target_block == le32_to_cpu(gdp->bg_block_bitmap) ||
 	    target_block == le32_to_cpu(gdp->bg_inode_bitmap) ||
 	    in_range(target_block, le32_to_cpu(gdp->bg_inode_table),
-		      sb->u.ext3_sb.s_itb_per_group))
+		      EXT3_SB(sb)->s_itb_per_group))
 		ext3_error(sb, "ext3_new_block",
 			    "Allocating block in system zone - "
 			    "block = %u", target_block);
@@ -594,9 +594,9 @@ got_block:
 	if (!fatal)
 		fatal = err;
 
-	BUFFER_TRACE(sb->u.ext3_sb.s_sbh,
+	BUFFER_TRACE(EXT3_SB(sb)->s_sbh,
 			"journal_dirty_metadata for superblock");
-	err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	if (!fatal)
 		fatal = err;
 
@@ -637,11 +637,11 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 	int i;
 	
 	lock_super(sb);
-	es = sb->u.ext3_sb.s_es;
+	es = EXT3_SB(sb)->s_es;
 	desc_count = 0;
 	bitmap_count = 0;
 	gdp = NULL;
-	for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
 		gdp = ext3_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
@@ -662,7 +662,7 @@ unsigned long ext3_count_free_blocks(struct super_block *sb)
 	unlock_super(sb);
 	return bitmap_count;
 #else
-	return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count);
+	return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_blocks_count);
 #endif
 }
 
@@ -671,7 +671,7 @@ static inline int block_in_use(unsigned long block,
 				unsigned char * map)
 {
 	return ext3_test_bit ((block -
-		le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) %
+		le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) %
 			 EXT3_BLOCKS_PER_GROUP(sb), map);
 }
 
@@ -738,11 +738,11 @@ void ext3_check_blocks_bitmap (struct super_block * sb)
 	struct ext3_group_desc *gdp;
 	int i;
 
-	es = sb->u.ext3_sb.s_es;
+	es = EXT3_SB(sb)->s_es;
 	desc_count = 0;
 	bitmap_count = 0;
 	gdp = NULL;
-	for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
 		gdp = ext3_get_group_desc (sb, i, NULL);
 		if (!gdp)
 			continue;
@@ -776,7 +776,7 @@ void ext3_check_blocks_bitmap (struct super_block * sb)
 				    "Inode bitmap for group %d is marked free",
 				    i);
 
-		for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++)
+		for (j = 0; j < EXT3_SB(sb)->s_itb_per_group; j++)
 			if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
 							sb, bitmap_bh->b_data))
 				ext3_error (sb, "ext3_check_blocks_bitmap",
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index b05b851429e6..0607c641066b 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -54,7 +54,7 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
 	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
 		error_msg = "directory entry across blocks";
 	else if (le32_to_cpu(de->inode) >
-			le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
+			le32_to_cpu(EXT3_SB(dir->i_sb)->s_es->s_inodes_count))
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 3cf6c555a1c9..08cb455efbc6 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -127,7 +127,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	clear_inode (inode);
 
 	lock_super (sb);
-	es = sb->u.ext3_sb.s_es;
+	es = EXT3_SB(sb)->s_es;
 	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
 		ext3_error (sb, "ext3_free_inode",
 			    "reserved or nonexistent inode %lu", ino);
@@ -155,8 +155,8 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 		fatal = ext3_journal_get_write_access(handle, bh2);
 		if (fatal) goto error_return;
 
-		BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access");
-		fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+		BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get write access");
+		fatal = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
 		if (fatal) goto error_return;
 
 		if (gdp) {
@@ -171,9 +171,9 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 		if (!fatal) fatal = err;
 		es->s_free_inodes_count =
 			cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
-		BUFFER_TRACE(sb->u.ext3_sb.s_sbh,
+		BUFFER_TRACE(EXT3_SB(sb)->s_sbh,
 					"call ext3_journal_dirty_metadata");
-		err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+		err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 		if (!fatal) fatal = err;
 	}
 	BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
@@ -222,16 +222,16 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
 	ei = EXT3_I(inode);
 
 	lock_super (sb);
-	es = sb->u.ext3_sb.s_es;
+	es = EXT3_SB(sb)->s_es;
 repeat:
 	gdp = NULL;
 	i = 0;
 
 	if (S_ISDIR(mode)) {
 		avefreei = le32_to_cpu(es->s_free_inodes_count) /
-			sb->u.ext3_sb.s_groups_count;
+			EXT3_SB(sb)->s_groups_count;
 		if (!gdp) {
-			for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) {
+			for (j = 0; j < EXT3_SB(sb)->s_groups_count; j++) {
 				struct buffer_head *temp_buffer;
 				tmp = ext3_get_group_desc (sb, j, &temp_buffer);
 				if (tmp &&
@@ -261,10 +261,10 @@ repeat:
 			 * Use a quadratic hash to find a group with a
 			 * free inode
 			 */
-			for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) {
+			for (j = 1; j < EXT3_SB(sb)->s_groups_count; j <<= 1) {
 				i += j;
-				if (i >= sb->u.ext3_sb.s_groups_count)
-					i -= sb->u.ext3_sb.s_groups_count;
+				if (i >= EXT3_SB(sb)->s_groups_count)
+					i -= EXT3_SB(sb)->s_groups_count;
 				tmp = ext3_get_group_desc (sb, i, &bh2);
 				if (tmp &&
 				    le16_to_cpu(tmp->bg_free_inodes_count)) {
@@ -278,8 +278,8 @@ repeat:
 			 * That failed: try linear search for a free inode
 			 */
 			i = EXT3_I(dir)->i_block_group + 1;
-			for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) {
-				if (++i >= sb->u.ext3_sb.s_groups_count)
+			for (j = 2; j < EXT3_SB(sb)->s_groups_count; j++) {
+				if (++i >= EXT3_SB(sb)->s_groups_count)
 					i = 0;
 				tmp = ext3_get_group_desc (sb, i, &bh2);
 				if (tmp &&
@@ -357,13 +357,13 @@ repeat:
 	err = ext3_journal_dirty_metadata(handle, bh2);
 	if (err) goto fail;
 	
-	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
 	if (err) goto fail;
 	es->s_free_inodes_count =
 		cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
-	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata");
-	err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "call ext3_journal_dirty_metadata");
+	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	if (err) goto fail;
 
@@ -417,7 +417,7 @@ repeat:
 	if (IS_DIRSYNC(inode))
 		handle->h_sync = 1;
 	insert_inode_hash(inode);
-	inode->i_generation = sb->u.ext3_sb.s_next_generation++;
+	inode->i_generation = EXT3_SB(sb)->s_next_generation++;
 
 	ei->i_state = EXT3_STATE_NEW;
 	err = ext3_mark_inode_dirty(handle, inode);
@@ -512,11 +512,11 @@ unsigned long ext3_count_free_inodes (struct super_block * sb)
 	int i;
 
 	lock_super (sb);
-	es = sb->u.ext3_sb.s_es;
+	es = EXT3_SB(sb)->s_es;
 	desc_count = 0;
 	bitmap_count = 0;
 	gdp = NULL;
-	for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
 		gdp = ext3_get_group_desc (sb, i, NULL);
 		if (!gdp)
 			continue;
@@ -537,7 +537,7 @@ unsigned long ext3_count_free_inodes (struct super_block * sb)
 	unlock_super(sb);
 	return desc_count;
 #else
-	return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count);
+	return le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count);
 #endif
 }
 
@@ -551,11 +551,11 @@ void ext3_check_inodes_bitmap (struct super_block * sb)
 	struct ext3_group_desc * gdp;
 	int i;
 
-	es = sb->u.ext3_sb.s_es;
+	es = EXT3_SB(sb)->s_es;
 	desc_count = 0;
 	bitmap_count = 0;
 	gdp = NULL;
-	for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+	for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) {
 		gdp = ext3_get_group_desc (sb, i, NULL);
 		if (!gdp)
 			continue;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 681d0dc715b3..99157f9cb6e7 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -471,7 +471,7 @@ static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
 	 * the same cylinder group then.
 	 */
 	return (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
-	       le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
+	       le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
 }
 
 /**
@@ -2141,20 +2141,20 @@ int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
 		inode->i_ino != EXT3_JOURNAL_INO &&
 		inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
 		inode->i_ino > le32_to_cpu(
-			inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
+			EXT3_SB(inode->i_sb)->s_es->s_inodes_count)) {
 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
 			    "bad inode number: %lu", inode->i_ino);
 		goto bad_inode;
 	}
 	block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
-	if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
+	if (block_group >= EXT3_SB(inode->i_sb)->s_groups_count) {
 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
 			    "group >= groups count");
 		goto bad_inode;
 	}
 	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
 	desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
-	bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
+	bh = EXT3_SB(inode->i_sb)->s_group_desc[group_desc];
 	if (!bh) {
 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
 			    "Descriptor not loaded");
@@ -2224,7 +2224,7 @@ void ext3_read_inode(struct inode * inode)
 	 */
 	if (inode->i_nlink == 0) {
 		if (inode->i_mode == 0 ||
-		    !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
+		    !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
 			/* this inode is deleted */
 			brelse (bh);
 			goto bad_inode;
@@ -2394,7 +2394,7 @@ static int ext3_do_update_inode(handle_t *handle,
 				* created, add a flag to the superblock.
 				*/
 				err = ext3_journal_get_write_access(handle,
-						sb->u.ext3_sb.s_sbh);
+						EXT3_SB(sb)->s_sbh);
 				if (err)
 					goto out_brelse;
 				ext3_update_dynamic_rev(sb);
@@ -2403,7 +2403,7 @@ static int ext3_do_update_inode(handle_t *handle,
 				sb->s_dirt = 1;
 				handle->h_sync = 1;
 				err = ext3_journal_dirty_metadata(handle,
-						sb->u.ext3_sb.s_sbh);
+						EXT3_SB(sb)->s_sbh);
 			}
 		}
 	}
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 9fe481b632d7..5d74409cef07 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -159,12 +159,12 @@ flags_err:
 			int ret = 0;
 
 			set_current_state(TASK_INTERRUPTIBLE);
-			add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
-			if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) {
+			add_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
+			if (timer_pending(&EXT3_SB(sb)->turn_ro_timer)) {
 				schedule();
 				ret = 1;
 			}
-			remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
+			remove_wait_queue(&EXT3_SB(sb)->ro_wait_queue, &wait);
 			return ret;
 		}
 #endif
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index c64049f3f2d2..a2c72d68dd37 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -729,8 +729,8 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
 	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
 
-	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
-	err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
 	if (err)
 		goto out_unlock;
 	
@@ -741,7 +741,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
 	/* Insert this inode at the head of the on-disk orphan list... */
 	NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
 	EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-	err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
 	if (!err)
 		err = rc;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1886280a074b..81c5537affd7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -120,7 +120,7 @@ static int ext3_error_behaviour(struct super_block *sb)
 	/* If no overrides were specified on the mount, then fall back
 	 * to the default behaviour set in the filesystem's superblock
 	 * on disk. */
-	switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) {
+	switch (le16_to_cpu(EXT3_SB(sb)->s_es->s_errors)) {
 	case EXT3_ERRORS_PANIC:
 		return EXT3_ERRORS_PANIC;
 	case EXT3_ERRORS_RO:
@@ -268,9 +268,9 @@ void ext3_abort (struct super_block * sb, const char * function,
 		return;
 	
 	printk (KERN_CRIT "Remounting filesystem read-only\n");
-	sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
+	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
-	sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT;
+	EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
 	journal_abort(EXT3_SB(sb)->s_journal, -EIO);
 }
 
@@ -439,7 +439,8 @@ void ext3_put_super (struct super_block * sb)
 		ext3_blkdev_remove(sbi);
 	}
 	clear_ro_after(sb);
-
+	sb->u.generic_sbp = NULL;
+	kfree(sbi);
 	return;
 }
 
@@ -877,7 +878,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 		sb->s_flags &= ~MS_RDONLY;
 	}
 
-	if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) {
+	if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
 		if (es->s_last_orphan)
 			jbd_debug(1, "Errors on filesystem, "
 				  "clearing orphan list.\n");
@@ -949,7 +950,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 {
 	struct buffer_head * bh;
 	struct ext3_super_block *es = 0;
-	struct ext3_sb_info *sbi = EXT3_SB(sb);
+	struct ext3_sb_info *sbi;
 	unsigned long sb_block = 1;
 	unsigned long logic_sb_block = 1;
 	unsigned long offset = 0;
@@ -970,7 +971,11 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	 * This is important for devices that have a hardware
 	 * sectorsize that is larger than the default.
 	 */
-
+	sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->u.generic_sbp = sbi;
+	memset(sbi, 0, sizeof(*sbi));
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT3_DEF_RESUID;
 	sbi->s_resgid = EXT3_DEF_RESGID;
@@ -1266,6 +1271,8 @@ failed_mount:
 	ext3_blkdev_remove(sbi);
 	brelse(bh);
 out_fail:
+	sb->u.generic_sbp = NULL;
+	kfree(sbi);
 	return -EINVAL;
 }
 
@@ -1520,11 +1527,11 @@ static void ext3_commit_super (struct super_block * sb,
 			       int sync)
 {
 	es->s_wtime = cpu_to_le32(CURRENT_TIME);
-	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty");
-	mark_buffer_dirty(sb->u.ext3_sb.s_sbh);
+	BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "marking dirty");
+	mark_buffer_dirty(EXT3_SB(sb)->s_sbh);
 	if (sync) {
-		ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh);
-		wait_on_buffer(sb->u.ext3_sb.s_sbh);
+		ll_rw_block(WRITE, 1, &EXT3_SB(sb)->s_sbh);
+		wait_on_buffer(EXT3_SB(sb)->s_sbh);
 	}
 }
 
@@ -1575,7 +1582,7 @@ static void ext3_clear_journal_err(struct super_block * sb,
 		ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
 			     "filesystem check.");
 		
-		sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
+		EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
 		ext3_commit_super (sb, es, 1);
 
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index e05c9d984080..d4550e28f37e 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -97,9 +97,9 @@
 # define EXT3_BLOCK_SIZE_BITS(s)	((s)->s_log_block_size + 10)
 #endif
 #ifdef __KERNEL__
-#define	EXT3_ADDR_PER_BLOCK_BITS(s)	((s)->u.ext3_sb.s_addr_per_block_bits)
-#define EXT3_INODE_SIZE(s)		((s)->u.ext3_sb.s_inode_size)
-#define EXT3_FIRST_INO(s)		((s)->u.ext3_sb.s_first_ino)
+#define	EXT3_ADDR_PER_BLOCK_BITS(s)	(EXT3_SB(s)->s_addr_per_block_bits)
+#define EXT3_INODE_SIZE(s)		(EXT3_SB(s)->s_inode_size)
+#define EXT3_FIRST_INO(s)		(EXT3_SB(s)->s_first_ino)
 #else
 #define EXT3_INODE_SIZE(s)	(((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
 				 EXT3_GOOD_OLD_INODE_SIZE : \
@@ -116,8 +116,8 @@
 #define	EXT3_MAX_FRAG_SIZE		4096
 #define EXT3_MIN_FRAG_LOG_SIZE		  10
 #ifdef __KERNEL__
-# define EXT3_FRAG_SIZE(s)		((s)->u.ext3_sb.s_frag_size)
-# define EXT3_FRAGS_PER_BLOCK(s)	((s)->u.ext3_sb.s_frags_per_block)
+# define EXT3_FRAG_SIZE(s)		(EXT3_SB(s)->s_frag_size)
+# define EXT3_FRAGS_PER_BLOCK(s)	(EXT3_SB(s)->s_frags_per_block)
 #else
 # define EXT3_FRAG_SIZE(s)		(EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size)
 # define EXT3_FRAGS_PER_BLOCK(s)	(EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s))
@@ -164,10 +164,10 @@ struct ext3_group_desc
  * Macro-instructions used to manage group descriptors
  */
 #ifdef __KERNEL__
-# define EXT3_BLOCKS_PER_GROUP(s)	((s)->u.ext3_sb.s_blocks_per_group)
-# define EXT3_DESC_PER_BLOCK(s)		((s)->u.ext3_sb.s_desc_per_block)
-# define EXT3_INODES_PER_GROUP(s)	((s)->u.ext3_sb.s_inodes_per_group)
-# define EXT3_DESC_PER_BLOCK_BITS(s)	((s)->u.ext3_sb.s_desc_per_block_bits)
+# define EXT3_BLOCKS_PER_GROUP(s)	(EXT3_SB(s)->s_blocks_per_group)
+# define EXT3_DESC_PER_BLOCK(s)		(EXT3_SB(s)->s_desc_per_block)
+# define EXT3_INODES_PER_GROUP(s)	(EXT3_SB(s)->s_inodes_per_group)
+# define EXT3_DESC_PER_BLOCK_BITS(s)	(EXT3_SB(s)->s_desc_per_block_bits)
 #else
 # define EXT3_BLOCKS_PER_GROUP(s)	((s)->s_blocks_per_group)
 # define EXT3_DESC_PER_BLOCK(s)		(EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc))
@@ -346,7 +346,7 @@ struct ext3_inode {
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT3_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT3_MOUNT_##opt
-#define test_opt(sb, opt)		((sb)->u.ext3_sb.s_mount_opt & \
+#define test_opt(sb, opt)		(EXT3_SB(sb)->s_mount_opt & \
 					 EXT3_MOUNT_##opt)
 #else
 #define EXT2_MOUNT_NOLOAD		EXT3_MOUNT_NOLOAD
@@ -444,7 +444,10 @@ struct ext3_super_block {
 };
 
 #ifdef __KERNEL__
-#define EXT3_SB(sb)	(&((sb)->u.ext3_sb))
+static inline struct ext3_sb_info * EXT3_SB(struct super_block *sb)
+{
+	return sb->u.generic_sbp;
+}
 static inline struct ext3_inode_info *EXT3_I(struct inode *inode)
 {
 	return container_of(inode, struct ext3_inode_info, vfs_inode);
-- 
cgit v1.2.3


From 5868caf6dc1c08a70b6b83d6fd420ea2625cb408 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@digeo.com>
Date: Sun, 15 Sep 2002 08:51:02 -0700
Subject: [PATCH] add dump_stack(): cross-arch backtrace

From Christoph Hellwig, also present in 2.4.

Create an arch-independent `dump_stack()' function.  So we don't need to do

#ifdef CONFIG_X86
	show_stack(0);		/* No prototype in scope! */
#endif

any more.

The whole dump_stack() implementation is delegated to the architecture.
If it doesn't provide one, there is a default do-nothing library
function.
---
 arch/alpha/kernel/traps.c |  5 +++++
 arch/cris/kernel/traps.c  |  6 +++++-
 arch/i386/kernel/traps.c  |  8 ++++++++
 fs/buffer.c               |  4 +---
 include/linux/kernel.h    |  2 ++
 kernel/ksyms.c            |  3 +++
 lib/Makefile              |  2 +-
 lib/dump_stack.c          | 13 +++++++++++++
 8 files changed, 38 insertions(+), 5 deletions(-)
 create mode 100644 lib/dump_stack.c

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index c90a631a8720..5bce21cd8e83 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -171,6 +171,11 @@ void show_stack(unsigned long *sp)
 	dik_show_trace(sp);
 }
 
+void dump_stack(void)
+{
+	show_stack(NULL);
+}
+
 void
 die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15)
 {
diff --git a/arch/cris/kernel/traps.c b/arch/cris/kernel/traps.c
index cae9ff413182..9666d32af626 100644
--- a/arch/cris/kernel/traps.c
+++ b/arch/cris/kernel/traps.c
@@ -230,8 +230,12 @@ watchdog_bite_hook(struct pt_regs *regs)
 #endif	
 }
 
-/* This is normally the 'Oops' routine */
+void dump_stack(void)
+{
+	show_stack(NULL);
+}
 
+/* This is normally the 'Oops' routine */
 void 
 die_if_kernel(const char * str, struct pt_regs * regs, long err)
 {
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 032514876dc3..3329600c7897 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -189,6 +189,14 @@ void show_stack(unsigned long * esp)
 	show_trace(esp);
 }
 
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+	show_stack(0);
+}
+
 void show_registers(struct pt_regs *regs)
 {
 	int i;
diff --git a/fs/buffer.c b/fs/buffer.c
index e07348ca9428..22272f97b0e4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -61,10 +61,8 @@ void __buffer_error(char *file, int line)
 		return;
 	enough++;
 	printk("buffer layer error at %s:%d\n", file, line);
-#ifdef CONFIG_X86
 	printk("Pass this trace through ksymoops for reporting\n");
-	show_stack(0);
-#endif
+	dump_stack();
 }
 EXPORT_SYMBOL(__buffer_error);
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 95965970e49b..5efa540d55f8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -96,6 +96,8 @@ extern const char *print_tainted(void);
 #define TAINT_FORCED_MODULE		(1<<1)
 #define TAINT_UNSAFE_SMP		(1<<2)
 
+extern void dump_stack(void);
+
 #if DEBUG
 #define pr_debug(fmt,arg...) \
 	printk(KERN_DEBUG fmt,##arg)
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index d69272a39175..61b045b1dd69 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -605,3 +605,6 @@ EXPORT_SYMBOL(pidhash);
 #if defined(CONFIG_SMP) && defined(__GENERIC_PER_CPU)
 EXPORT_SYMBOL(__per_cpu_offset);
 #endif
+
+/* debug */
+EXPORT_SYMBOL(dump_stack);
diff --git a/lib/Makefile b/lib/Makefile
index f16f1093e487..6589280083dd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -12,7 +12,7 @@ export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o \
 	       crc32.o rbtree.o radix-tree.o
 
 obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o \
-	 bust_spinlocks.o rbtree.o radix-tree.o
+	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o
 
 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
new file mode 100644
index 000000000000..47c2728ab8c7
--- /dev/null
+++ b/lib/dump_stack.c
@@ -0,0 +1,13 @@
+/*
+ * Provide a default dump_stack() function for architectures
+ * which don't implement their own.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+void dump_stack(void)
+{
+	printk(KERN_NOTICE
+		"This architecture does not implement dump_stack()\n");
+}
-- 
cgit v1.2.3


From 16b387466416560dbb6d100709fec09ec90412db Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@digeo.com>
Date: Sun, 15 Sep 2002 08:51:06 -0700
Subject: [PATCH] various small cleanups

- Remove defunct active_list/inactive_list declarations (wli)

- Update an obsolete comment (wli)

- "mm/slab.c contains one leftover from the initial version with
  'unsigned short' bufctl entries.  The attached patch replaces '2'
  with the correct sizeof [which is now 4]" - Manfred Spraul

- BUG checks for vfree/vunmap being called in interrupt context
  (because they take irq-unsafe spinlocks, I guess?) - davej

- Simplify some coding in one_highpage_init() (Christoph Hellwig).
---
 arch/i386/mm/init.c    | 19 +++++++------------
 include/linux/mm.h     |  3 ---
 include/linux/mmzone.h |  4 ++--
 mm/slab.c              |  2 +-
 mm/vmalloc.c           |  4 ++++
 5 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 0d026c67522c..5d73c07fd726 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -215,19 +215,14 @@ void __init permanent_kmaps_init(pgd_t *pgd_base)
 
 void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
 {
-	if (!page_is_ram(pfn)) {
+	if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
+		ClearPageReserved(page);
+		set_bit(PG_highmem, &page->flags);
+		set_page_count(page, 1);
+		__free_page(page);
+		totalhigh_pages++;
+	} else
 		SetPageReserved(page);
-		return;
-	}
-	if (bad_ppro && page_kills_ppro(pfn)) {
-		SetPageReserved(page);
-		return;
-	}
-	ClearPageReserved(page);
-	set_bit(PG_highmem, &page->flags);
-	atomic_set(&page->count, 1);
-	__free_page(page);
-	totalhigh_pages++;
 }
 
 #ifndef CONFIG_DISCONTIGMEM
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c90392074cc2..c7b4b52b0e3d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -19,9 +19,6 @@ extern unsigned long max_mapnr;
 extern unsigned long num_physpages;
 extern void * high_memory;
 extern int page_cluster;
-/* The inactive_clean lists are per zone. */
-extern struct list_head active_list;
-extern struct list_head inactive_list;
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b271e144972a..8ebf441bdb47 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -151,8 +151,8 @@ struct zonelist {
  * On NUMA machines, each NUMA node would have a pg_data_t to describe
  * it's memory layout.
  *
- * XXX: we need to move the global memory statistics (active_list, ...)
- *      into the pg_data_t to properly support NUMA.
+ * Memory statistics and page replacement data structures are maintained on a
+ * per-zone basis.
  */
 struct bootmem_data;
 typedef struct pglist_data {
diff --git a/mm/slab.c b/mm/slab.c
index f15e274aba6c..da75d9e11523 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -487,7 +487,7 @@ void __init kmem_cache_sizes_init(void)
 		/* Inc off-slab bufctl limit until the ceiling is hit. */
 		if (!(OFF_SLAB(sizes->cs_cachep))) {
 			offslab_limit = sizes->cs_size-sizeof(slab_t);
-			offslab_limit /= 2;
+			offslab_limit /= sizeof(kmem_bufctl_t);
 		}
 		sizes->cs_dmacachep = kmem_cache_create(
 		    cache_names[sizes-cache_sizes].name_dma, 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d2dff973429b..0f095c168d24 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -11,6 +11,8 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
 #include <linux/vmalloc.h>
 
 #include <asm/uaccess.h>
@@ -309,6 +311,7 @@ void __vunmap(void *addr, int deallocate_pages)
  */
 void vfree(void *addr)
 {
+	BUG_ON(in_interrupt());
 	__vunmap(addr, 1);
 }
 
@@ -324,6 +327,7 @@ void vfree(void *addr)
  */
 void vunmap(void *addr)
 {
+	BUG_ON(in_interrupt());
 	__vunmap(addr, 0);
 }
 
-- 
cgit v1.2.3


From 6865038acb61584977f8352418f06b9dedf267b0 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Sun, 15 Sep 2002 08:53:19 -0700
Subject: [PATCH] Remove CONFIG_SMP around wait_task_inactive()

Linus, please apply.  This defines wait_task_inactive() to be a no-op
on UP machines, and removes the #ifdef CONFIG_SMP which surrounds
current calls.

This also fixes compile on UP which was broken by the addition of a
call to wait_task_inactive in fs/exec.c which was not protected by an
#ifdef.
---
 arch/ia64/kernel/perfmon.c | 11 ++++-------
 include/linux/sched.h      |  4 ++++
 kernel/exit.c              |  2 --
 kernel/ptrace.c            |  2 --
 4 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index f1c6f6bedbf0..361d93363f5a 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2345,22 +2345,19 @@ static int
 check_task_state(struct task_struct *task)
 {
 	int ret = 0;
-#ifdef CONFIG_SMP
+
 	/* We must wait until the state has been completely
 	 * saved. There can be situations where the reader arrives before
 	 * after the task is marked as STOPPED but before pfm_save_regs()
 	 * is completed.
 	 */
-	if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) return -EBUSY;
-	DBprintk(("before wait_task_inactive [%d] state %ld\n", task->pid, task->state));
-	wait_task_inactive(task);
-	DBprintk(("after wait_task_inactive [%d] state %ld\n", task->pid, task->state));
-#else
 	if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) {
 		DBprintk(("warning [%d] not in stable state %ld\n", task->pid, task->state));
 		ret = -EBUSY;
 	}
-#endif
+	DBprintk(("before wait_task_inactive [%d] state %ld\n", task->pid, task->state));
+	wait_task_inactive(task);
+	DBprintk(("after wait_task_inactive [%d] state %ld\n", task->pid, task->state));
 	return ret;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b132cc3952ea..528072a7910b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -690,7 +690,11 @@ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
+#ifdef CONFIG_SMP
 extern void wait_task_inactive(task_t * p);
+#else
+#define wait_task_inactive(p)	do { } while (0)
+#endif
 extern void kick_if_running(task_t * p);
 
 #define __wait_event(wq, condition) 					\
diff --git a/kernel/exit.c b/kernel/exit.c
index cea498d24e39..8ab794864afd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,10 +55,8 @@ void release_task(struct task_struct * p)
 
 	if (p->state != TASK_ZOMBIE)
 		BUG();
-#ifdef CONFIG_SMP
 	if (p != current)
 		wait_task_inactive(p);
-#endif
 	atomic_dec(&p->user->processes);
 	security_ops->task_free_security(p);
 	free_uid(p->user);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4feba1214dc0..cc7a76bb65b7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -69,9 +69,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
 	if (!kill) {
 		if (child->state != TASK_STOPPED)
 			return -ESRCH;
-#ifdef CONFIG_SMP
 		wait_task_inactive(child);
-#endif		
 	}
 
 	/* All systems go.. */
-- 
cgit v1.2.3


From aa509d0d6e2a590bc9fe466e770d0223ec3b5f10 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@burns.home.kernel.dk>
Date: Mon, 16 Sep 2002 18:56:48 +0200
Subject: New IDE pci low level driver setup scheme

---
 drivers/ide/pci/aec62xx.c     |  77 ++++++++++++----
 drivers/ide/pci/aec62xx.h     |  15 +--
 drivers/ide/pci/alim15x3.c    |  70 +++++++++-----
 drivers/ide/pci/alim15x3.h    |   4 +-
 drivers/ide/pci/amd74xx.c     |  63 ++++++++++---
 drivers/ide/pci/amd74xx.h     |  16 +---
 drivers/ide/pci/cmd64x.c      |  72 ++++++++++-----
 drivers/ide/pci/cmd64x.h      |  11 +--
 drivers/ide/pci/cs5530.c      |  71 +++++++++------
 drivers/ide/pci/cs5530.h      |   4 +-
 drivers/ide/pci/cy82c693.c    |  63 +++++++++----
 drivers/ide/pci/cy82c693.h    |   4 +-
 drivers/ide/pci/generic.c     |  88 +++++++++++++++---
 drivers/ide/pci/generic.h     |  20 ++--
 drivers/ide/pci/hpt34x.c      |  62 +++++++++----
 drivers/ide/pci/hpt34x.h      |   4 +-
 drivers/ide/pci/hpt366.c      |  78 +++++++++++++---
 drivers/ide/pci/hpt366.h      |  10 +-
 drivers/ide/pci/it8172.c      |  58 +++++++++---
 drivers/ide/pci/it8172.h      |   2 +-
 drivers/ide/pci/ns87415.c     |  60 +++++++++---
 drivers/ide/pci/ns87415.h     |   4 +-
 drivers/ide/pci/nvidia.c      |  63 +++++++++----
 drivers/ide/pci/nvidia.h      |   8 +-
 drivers/ide/pci/opti621.c     |  64 ++++++++++---
 drivers/ide/pci/opti621.h     |   4 +-
 drivers/ide/pci/pdcadma.c     |  63 +++++++++----
 drivers/ide/pci/pdcadma.h     |   2 +-
 drivers/ide/pci/piix.c        |  87 ++++++++++++++----
 drivers/ide/pci/piix.h        |  42 +++++----
 drivers/ide/pci/rz1000.c      |  64 +++++++++----
 drivers/ide/pci/rz1000.h      |   3 -
 drivers/ide/pci/serverworks.c |  81 ++++++++++++++---
 drivers/ide/pci/serverworks.h |   8 +-
 drivers/ide/pci/siimage.c     |  62 ++++++++++---
 drivers/ide/pci/siimage.h     |   7 +-
 drivers/ide/pci/sis5513.c     |  61 ++++++++++---
 drivers/ide/pci/sis5513.h     |   4 +-
 drivers/ide/pci/sl82c105.c    |  57 +++++++++---
 drivers/ide/pci/sl82c105.h    |   4 +-
 drivers/ide/pci/slc90e66.c    |  61 ++++++++++---
 drivers/ide/pci/slc90e66.h    |   4 +-
 drivers/ide/pci/trm290.c      |  62 +++++++++----
 drivers/ide/pci/trm290.h      |   4 +-
 drivers/ide/pci/via82cxxx.c   |  67 +++++++++-----
 drivers/ide/pci/via82cxxx.h   |   7 +-
 drivers/ide/setup-pci.c       | 206 ++++++++++++------------------------------
 include/linux/ide.h           |  25 +----
 48 files changed, 1293 insertions(+), 683 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c
index dccd665d81b4..770b05852633 100644
--- a/drivers/ide/pci/aec62xx.c
+++ b/drivers/ide/pci/aec62xx.c
@@ -38,8 +38,6 @@ static int aec62xx_get_info (char *buffer, char **addr, off_t offset, int count)
 	char *chipset_nums[] = {"error", "error", "error", "error",
 				"error", "error", "850UF",   "860",
 				 "860R",   "865",  "865R", "error"  };
-//	char *modes_33[] = {};
-//	char *modes_34[] = {};
 	int i;
 
 	for (i = 0; i < n_aec_devs; i++) {
@@ -516,22 +514,69 @@ static void __init init_setup_aec6x80 (struct pci_dev *dev, ide_pci_device_t *d)
 	ide_setup_pci_device(dev, d);
 }
 
-int __init aec62xx_scan_pcidev (struct pci_dev *dev)
+/**
+ *	aec62xx_init_one	-	called when a AEC is found
+ *	@dev: the aec62xx device
+ *	@id: the matching pci id
+ *
+ *	Called when the PCI registration layer (or the IDE initialization)
+ *	finds a device matching our IDE device tables.
+ */
+ 
+static int __devinit aec62xx_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
-	ide_pci_device_t *d;
+	ide_pci_device_t *d = &aec62xx_chipsets[id->driver_data];
 
-	if (dev->vendor != PCI_VENDOR_ID_ARTOP)
-		return 0;
-
-	for (d = aec62xx_chipsets;
-		d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
+	if (dev->device != d->device)
+		BUG();
+	d->init_setup(dev, d);
 	return 0;
 }
 
+/**
+ *	aec62xx_remove_one	-	called when an AEC is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an AEC device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void aec62xx_remove_one(struct pci_dev *dev)
+{
+	panic("AEC62xx removal not yet supported");
+}
+
+static struct pci_device_id aec62xx_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_ARTOP, PCI_DEVICE_ID_ARTOP_ATP850UF, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
+	{ PCI_VENDOR_ID_ARTOP, PCI_DEVICE_ID_ARTOP_ATP860,   PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1 },
+	{ PCI_VENDOR_ID_ARTOP, PCI_DEVICE_ID_ARTOP_ATP860R,  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 2 },
+	{ PCI_VENDOR_ID_ARTOP, PCI_DEVICE_ID_ARTOP_ATP865,   PCI_ANY_ID, PCI_ANY_ID, 0, 0, 3 },
+	{ PCI_VENDOR_ID_ARTOP, PCI_DEVICE_ID_ARTOP_ATP865R,  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 4 },
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"AEC62xx IDE",
+	id_table:	aec62xx_pci_tbl,
+	probe:		aec62xx_init_one,
+	remove:		__devexit_p(aec62xx_remove_one),
+};
+
+static int aec62xx_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void aec62xx_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(aec62xx_ide_init);
+module_exit(aec62xx_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for ARTOP AEC62xx IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/aec62xx.h b/drivers/ide/pci/aec62xx.h
index 27d31eb6640f..8e149c089015 100644
--- a/drivers/ide/pci/aec62xx.h
+++ b/drivers/ide/pci/aec62xx.h
@@ -99,7 +99,7 @@ static void init_hwif_aec62xx(ide_hwif_t *);
 static void init_dma_aec62xx(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t aec62xx_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_ARTOP,
 		device:		PCI_DEVICE_ID_ARTOP_ATP850UF,
 		name:		"AEC6210",
@@ -113,7 +113,7 @@ static ide_pci_device_t aec62xx_chipsets[] __initdata = {
 		enablebits:	{{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		bootable:	OFF_BOARD,
 		extra:		0,
-	},{
+	},{	/* 1 */
 		vendor:		PCI_VENDOR_ID_ARTOP,
 		device:		PCI_DEVICE_ID_ARTOP_ATP860,
 		name:		"AEC6260",
@@ -127,7 +127,7 @@ static ide_pci_device_t aec62xx_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	OFF_BOARD,
 		extra:		0,
-	},{
+	},{	/* 2 */
 		vendor:		PCI_VENDOR_ID_ARTOP,
 		device:		PCI_DEVICE_ID_ARTOP_ATP860R,
 		name:		"AEC6260R",
@@ -141,7 +141,7 @@ static ide_pci_device_t aec62xx_chipsets[] __initdata = {
 		enablebits:	{{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		bootable:	NEVER_BOARD,
 		extra:		0,
-	},{
+	},{	/* 3 */
 		vendor:		PCI_VENDOR_ID_ARTOP,
 		device:		PCI_DEVICE_ID_ARTOP_ATP865,
 		name:		"AEC6X80",
@@ -155,7 +155,7 @@ static ide_pci_device_t aec62xx_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	OFF_BOARD,
 		extra:		0,
-	},{
+	},{	/* 4 */
 		vendor:		PCI_VENDOR_ID_ARTOP,
 		device:		PCI_DEVICE_ID_ARTOP_ATP865R,
 		name:		"AEC6X80R",
@@ -169,11 +169,6 @@ static ide_pci_device_t aec62xx_chipsets[] __initdata = {
 		enablebits:	{{0x4a,0x02,0x02}, {0x4a,0x04,0x04}},
 		bootable:	OFF_BOARD,
 		extra:		0,
-	},{
-		vendor:		0,
-		device:		0,
-		channels:	0,
-		bootable:	EOL,
 	}
 };
 
diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c
index f543761bb15e..ba5aa443124a 100644
--- a/drivers/ide/pci/alim15x3.c
+++ b/drivers/ide/pci/alim15x3.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
@@ -623,9 +624,11 @@ static unsigned int __init init_chipset_ali15x3 (struct pci_dev *dev, const char
 
 	/*
 	 * We should only tune the 1533 enable if we are using an ALi
-	 * North bridge
+	 * North bridge. We might have no north found on some zany
+	 * box without a device at 0:0.0. The ALi bridge will be at
+	 * 0:0.0 so if we didn't find one we know what is cooking.
 	 */
-	if (north->vendor != PCI_VENDOR_ID_AL) {
+	if (north && north->vendor != PCI_VENDOR_ID_AL) {
 		local_irq_restore(flags);
 	        return 0;
 	}
@@ -841,45 +844,64 @@ extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
 
 /**
- *	init_setup_ali15x3	-	set up an ALi15x3 IDE controller
+ *	alim15x3_init_one	-	set up an ALi15x3 IDE controller
  *	@dev: PCI device to set up
- *	@d: IDE PCI structures
  *
- *	Perform the actual set up for the ALi15x3.
+ *	Perform the actual set up for an ALi15x3 that has been found by the
+ *	hot plug layer.
  */
  
-static void __init init_setup_ali15x3 (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit alim15x3_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &ali15x3_chipsets[id->driver_data];
 #if defined(CONFIG_SPARC64)
 	d->init_hwif = init_hwif_common_ali15x3;
 #endif /* CONFIG_SPARC64 */
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
+
 /**
- *	ali15x3_scan_pcidev	-	check for ali pci ide
- *	@dev: device found by IDE ordered scan
+ *	ali15x3_remove_one	-	called with an ALi is unplugged
+ *	@dev: the device that was removed
  *
- *	If the device is a known ALi IDE controller we set it up and
- *	then return 1. If we do not know it, or set up fails we return 0
- *	and it will be offered to other drivers or taken generic
+ *	Disconnect an ALi device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
  */
  
-int __init ali15x3_scan_pcidev (struct pci_dev *dev)
+static void ali15x3_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("ALi removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_AL)
-		return 0;
+static struct pci_device_id alim15x3_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M5229, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
+	{ 0, },
+};
 
-	for (d = ali15x3_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"ALI15x3 IDE",
+	id_table:	alim15x3_pci_tbl,
+	probe:		alim15x3_init_one,
+	remove:		__devexit_p(ali15x3_remove_one),
+};
+
+static int ali15x3_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void ali15x3_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
 }
 
+module_init(ali15x3_ide_init);
+module_exit(ali15x3_ide_exit);
+
+MODULE_AUTHOR("Michael Aubry, Andrzej Krzysztofowicz, CJ, Andre Hedrick, Alan Cox");
+MODULE_DESCRIPTION("PCI driver module for ALi 15x3 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/alim15x3.h b/drivers/ide/pci/alim15x3.h
index c18ec2c0a84b..bbd8fa7b0c54 100644
--- a/drivers/ide/pci/alim15x3.h
+++ b/drivers/ide/pci/alim15x3.h
@@ -25,18 +25,16 @@ static ide_pci_host_proc_t ali_procs[] __initdata = {
 };
 #endif /* DISPLAY_ALI_TIMINGS && CONFIG_PROC_FS */
 
-static void init_setup_ali15x3(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_ali15x3(struct pci_dev *, const char *);
 static void init_hwif_common_ali15x3(ide_hwif_t *);
 static void init_hwif_ali15x3(ide_hwif_t *);
 static void init_dma_ali15x3(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t ali15x3_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_AL,
 		device:		PCI_DEVICE_ID_AL_M5229,
 		name:		"ALI15X3",
-		init_setup:	init_setup_ali15x3,
 		init_chipset:	init_chipset_ali15x3,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_ali15x3,
diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index c65d6e558341..3d456558c3ca 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
@@ -392,26 +393,60 @@ static void __init init_dma_amd74xx (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_amd74xx (struct pci_dev *dev, ide_pci_device_t *d)
+
+static int __devinit amd74xx_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &amd74xx_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init amd74xx_scan_pcidev (struct pci_dev *dev)
+/**
+ *	amd74xx_remove_one	-	called with an AMD IDE is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an AMD IDE device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void amd74xx_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("AMD IDE removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_AMD)
-		return 0;
+static struct pci_device_id amd74xx_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_COBRA_7401,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_VIPER_7409,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_VIPER_7411,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 2},
+	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_OPUS_7441,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 3},
+	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_8111_IDE, 	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 4},
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"AMD IDE",
+	id_table:	amd74xx_pci_tbl,
+	probe:		amd74xx_init_one,
+	remove:		__devexit_p(amd74xx_remove_one),
+};
+
+static int amd74xx_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
 
-	for (d = amd74xx_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static void amd74xx_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
 }
 
+module_init(amd74xx_ide_init);
+module_exit(amd74xx_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for AMD IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/amd74xx.h b/drivers/ide/pci/amd74xx.h
index 9b26fe08fd68..bc2a50060fb2 100644
--- a/drivers/ide/pci/amd74xx.h
+++ b/drivers/ide/pci/amd74xx.h
@@ -25,17 +25,15 @@ static ide_pci_host_proc_t amd74xx_procs[] __initdata = {
 };
 #endif  /* defined(DISPLAY_VIPER_TIMINGS) && defined(CONFIG_PROC_FS) */
 
-static void init_setup_amd74xx(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_amd74xx(struct pci_dev *, const char *);
 static void init_hwif_amd74xx(ide_hwif_t *);
 static void init_dma_amd74xx(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t amd74xx_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_AMD,
 		device:		PCI_DEVICE_ID_AMD_COBRA_7401,
 		name:		"AMD7401",
-		init_setup:	init_setup_amd74xx,
 		init_chipset:	init_chipset_amd74xx,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_amd74xx,
@@ -45,11 +43,10 @@ static ide_pci_device_t amd74xx_chipsets[] __initdata = {
 		enablebits:	{{0x40,0x01,0x01}, {0x40,0x02,0x02}},
 		bootable:	ON_BOARD,
 		extra:		0
-	},{
+	},{	/* 1 */
 		vendor:		PCI_VENDOR_ID_AMD,
 		device:		PCI_DEVICE_ID_AMD_VIPER_7409,
 		name:		"AMD7409",
-		init_setup:	init_setup_amd74xx,
 		init_chipset:	init_chipset_amd74xx,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_amd74xx,
@@ -59,11 +56,10 @@ static ide_pci_device_t amd74xx_chipsets[] __initdata = {
 		enablebits:	{{0x40,0x01,0x01}, {0x40,0x02,0x02}},
 		bootable:	ON_BOARD,
 		extra:		0
-	},{
+	},{	/* 2 */
 		vendor:		PCI_VENDOR_ID_AMD,
 		device:		PCI_DEVICE_ID_AMD_VIPER_7411,
 		name:		"AMD7411",
-		init_setup:	init_setup_amd74xx,
 		init_chipset:	init_chipset_amd74xx,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_amd74xx,
@@ -73,11 +69,10 @@ static ide_pci_device_t amd74xx_chipsets[] __initdata = {
 		enablebits:	{{0x40,0x01,0x01}, {0x40,0x02,0x02}},
 		bootable:	ON_BOARD,
 		extra:		0
-	},{
+	},{	/* 3 */
 		vendor:		PCI_VENDOR_ID_AMD,
 		device:		PCI_DEVICE_ID_AMD_OPUS_7441,
 		name:		"AMD7441",
-		init_setup:	init_setup_amd74xx,
 		init_chipset:	init_chipset_amd74xx,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_amd74xx,
@@ -87,11 +82,10 @@ static ide_pci_device_t amd74xx_chipsets[] __initdata = {
 		enablebits:	{{0x40,0x01,0x01}, {0x40,0x02,0x02}},
 		bootable:	ON_BOARD,
 		extra:		0
-	},{
+	},{	/* 4 */
 		vendor:		PCI_VENDOR_ID_AMD,
 		device:		PCI_DEVICE_ID_AMD_8111_IDE,
 		name:		"AMD8111",
-		init_setup:	init_setup_amd74xx,
 		init_chipset:	init_chipset_amd74xx,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_amd74xx,
diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c
index de9f79095ced..5bfeaa97c18e 100644
--- a/drivers/ide/pci/cmd64x.c
+++ b/drivers/ide/pci/cmd64x.c
@@ -1,6 +1,6 @@
 /* $Id: cmd64x.c,v 1.21 2000/01/30 23:23:16
  *
- * linux/drivers/ide/cmd64x.c		Version 1.22	June 9, 2000
+ * linux/drivers/ide/cmd64x.c		Version 1.30	Sept 10, 2002
  *
  * cmd64x.c: Enable interrupts at initialization time on Ultra/PCI machines.
  *           Note, this driver is not used at all on other systems because
@@ -15,6 +15,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
@@ -727,10 +728,12 @@ static void __init init_hwif_cmd64x (ide_hwif_t *hwif)
 	if (dev->device == PCI_DEVICE_ID_CMD_643)
 		hwif->ultra_mask = 0x80;
 	if (dev->device == PCI_DEVICE_ID_CMD_646)
+	{
 		if (class_rev > 0x04)
 			hwif->ultra_mask = 0x07;
 		else
 			hwif->ultra_mask = 0x80;
+	}
 
 #ifdef CONFIG_BLK_DEV_IDEDMA
 	hwif->ide_dma_check = &cmd64x_config_drive_for_dma;
@@ -758,10 +761,6 @@ static void __init init_hwif_cmd64x (ide_hwif_t *hwif)
 #endif /* CONFIG_BLK_DEV_IDEDMA */
 }
 
-/**
- *	FIXME: not required ?
- */
- 
 static void __init init_dma_cmd64x (ide_hwif_t *hwif, unsigned long dmabase)
 {
 	ide_setup_dma(hwif, dmabase, 8);
@@ -769,30 +768,59 @@ static void __init init_dma_cmd64x (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
+static int __devinit cmd64x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	ide_pci_device_t *d = &cmd64x_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
+	ide_setup_pci_device(dev, d);
+	return 0;
+}
+
 /**
- *	FIXME: not required either ?
+ *	cmd64x_remove_one	-	called with an CMD64x is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect a CMD64x device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
  */
  
-static void __init init_setup_cmd64x (struct pci_dev *dev, ide_pci_device_t *d)
+static void cmd64x_remove_one(struct pci_dev *dev)
 {
-	ide_setup_pci_device(dev, d);
+	panic("CMD64x removal not yet supported");
 }
 
-int __init cmd64x_scan_pcidev (struct pci_dev *dev)
+static struct pci_device_id cmd64x_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_CMD, PCI_DEVICE_ID_CMD_643, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_CMD, PCI_DEVICE_ID_CMD_646, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ PCI_VENDOR_ID_CMD, PCI_DEVICE_ID_CMD_648, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 2},
+	{ PCI_VENDOR_ID_CMD, PCI_DEVICE_ID_CMD_649, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 3},
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"CMD64x IDE",
+	id_table:	cmd64x_pci_tbl,
+	probe:		cmd64x_init_one,
+	remove:		__devexit_p(cmd64x_remove_one),
+};
+
+static int cmd64x_ide_init(void)
 {
-	ide_pci_device_t *d;
-
-	if (dev->vendor != PCI_VENDOR_ID_CMD)
-		return 0;
+	return ide_pci_register_driver(&driver);
+}
 
-	for (d = cmd64x_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static void cmd64x_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
 }
 
+module_init(cmd64x_ide_init);
+module_exit(cmd64x_ide_exit);
+
+MODULE_AUTHOR("Eddie Dost, David Miller, Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for CMD64x IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
+
diff --git a/drivers/ide/pci/cmd64x.h b/drivers/ide/pci/cmd64x.h
index bf679b96ff8d..b68f863a3057 100644
--- a/drivers/ide/pci/cmd64x.h
+++ b/drivers/ide/pci/cmd64x.h
@@ -79,17 +79,15 @@ static ide_pci_host_proc_t cmd64x_procs[] __initdata = {
 };
 #endif  /* defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_PROC_FS) */
 
-static void init_setup_cmd64x(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_cmd64x(struct pci_dev *, const char *);
 static void init_hwif_cmd64x(ide_hwif_t *);
 static void init_dma_cmd64x(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t cmd64x_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_CMD,
 		device:		PCI_DEVICE_ID_CMD_643,
 		name:		"CMD643",
-		init_setup:	init_setup_cmd64x,
 		init_chipset:	init_chipset_cmd64x,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_cmd64x,
@@ -99,11 +97,10 @@ static ide_pci_device_t cmd64x_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 1 */
 		vendor:		PCI_VENDOR_ID_CMD,
 		device:		PCI_DEVICE_ID_CMD_646,
 		name:		"CMD646",
-		init_setup:	init_setup_cmd64x,
 		init_chipset:	init_chipset_cmd64x,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_cmd64x,
@@ -113,11 +110,10 @@ static ide_pci_device_t cmd64x_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x51,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 2 */
 		vendor:		PCI_VENDOR_ID_CMD,
 		device:	PCI_DEVICE_ID_CMD_648,
 		name:		"CMD648",
-		init_setup:	init_setup_cmd64x,
 		init_chipset:	init_chipset_cmd64x,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_cmd64x,
@@ -131,7 +127,6 @@ static ide_pci_device_t cmd64x_chipsets[] __initdata = {
 		vendor:		PCI_VENDOR_ID_CMD,
 		device:		PCI_DEVICE_ID_CMD_649,
 		name:		"CMD649",
-		init_setup:	init_setup_cmd64x,
 		init_chipset:	init_chipset_cmd64x,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_cmd64x,
diff --git a/drivers/ide/pci/cs5530.c b/drivers/ide/pci/cs5530.c
index 323caf78326d..85d85e091b3c 100644
--- a/drivers/ide/pci/cs5530.c
+++ b/drivers/ide/pci/cs5530.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/ide/cs5530.c		Version 0.6	Mar. 18, 2000
+ * linux/drivers/ide/cs5530.c		Version 0.7	Sept 10, 2002
  *
  * Copyright (C) 2000			Andre Hedrick <andre@linux-ide.org>
  * Ditto of GNU General Public License.
@@ -12,6 +12,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
@@ -419,44 +420,56 @@ static void __init init_dma_cs5530 (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-/**
- *	init_setup_cs5530	-	set up a CS5530 IDE
- *	@dev: PCI device
- *	@d: PCI ide device info
- *
- *	FIXME: this function can go away too
- */
- 
-static void __init init_setup_cs5530 (struct pci_dev *dev, ide_pci_device_t *d)
+
+static int __devinit cs5530_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &cs5530_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-
 /**
- *	cs5530_scan_pcidev	-	set up any CS5530 device
- *	@dev:	pci device to check
+ *	cs5530_remove_one	-	called when a CS5530 is unplugged
+ *	@dev: the device that was removed
  *
- *	Check if the device is a 5530 IDE controller. If it is then 
- *	claim and set up the interface.  Return 1 if we claimed the
- *	interface or zero if it is not ours
+ *	Disconnect an Cyrix device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
  */
  
-int __init cs5530_scan_pcidev (struct pci_dev *dev)
+static void cs5530_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("Cyrix removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_CYRIX)
-		return 0;
+static struct pci_device_id cs5530_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = cs5530_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"CS5530 IDE",
+	id_table:	cs5530_pci_tbl,
+	probe:		cs5530_init_one,
+	remove:		__devexit_p(cs5530_remove_one),
+};
+
+static int cs5530_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void cs5530_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(cs5530_ide_init);
+module_exit(cs5530_ide_exit);
+
+MODULE_AUTHOR("Mark Lord");
+MODULE_DESCRIPTION("PCI driver module for Cyrix/NS 5530 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/cs5530.h b/drivers/ide/pci/cs5530.h
index 7d22dda909e5..b68f73392117 100644
--- a/drivers/ide/pci/cs5530.h
+++ b/drivers/ide/pci/cs5530.h
@@ -25,17 +25,15 @@ static ide_pci_host_proc_t cs5530_procs[] __initdata = {
 };
 #endif /* DISPLAY_CS5530_TIMINGS && CONFIG_PROC_FS */
 
-static void init_setup_cs5530(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_cs5530(struct pci_dev *, const char *);
 static void init_hwif_cs5530(ide_hwif_t *);
 static void init_dma_cs5530(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t cs5530_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_CYRIX,
 		device:		PCI_DEVICE_ID_CYRIX_5530_IDE,
 		name:		"CS5530",
-		init_setup:	init_setup_cs5530,
 		init_chipset:	init_chipset_cs5530,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_cs5530,
diff --git a/drivers/ide/pci/cy82c693.c b/drivers/ide/pci/cy82c693.c
index c359a72c3ef1..6292a715b4fb 100644
--- a/drivers/ide/pci/cy82c693.c
+++ b/drivers/ide/pci/cy82c693.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/ide/cy82c693.c		Version 0.34	Dec. 13, 1999
+ * linux/drivers/ide/cy82c693.c		Version 0.40	Sep. 10, 2002
  *
  *  Copyright (C) 1998-2000 Andreas S. Krebs (akrebs@altavista.net), Maintainer
  *  Copyright (C) 1998-2002 Andre Hedrick <andre@linux-ide.org>, Integrater
@@ -29,8 +29,7 @@
  * - first tests with DMA look okay, they seem to work, but there is a
  *   problem with sound - the BusMaster IDE TimeOut should fixed this
  *
- *
- * History:
+ * Ancient History:
  * AMH@1999-08-24: v0.34 init_cy82c693_chip moved to pci_init_cy82c693
  * ASK@1999-01-23: v0.33 made a few minor code clean ups
  *                       removed DMA clock speed setting by default
@@ -46,6 +45,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
@@ -418,29 +418,56 @@ void __init init_dma_cy82c693 (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-void __init init_setup_cy82c693 (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit cy82c693_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &cy82c693_chipsets[id->driver_data];
         if ((!(PCI_FUNC(dev->devfn) & 1) ||
 	    (!((dev->class >> 8) == PCI_CLASS_STORAGE_IDE))))
-		return;	/* CY82C693 is more than only a IDE controller */
+		return 0;	/* CY82C693 is more than only a IDE controller */
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init cy82c693_scan_pcidev (struct pci_dev *dev)
+/**
+ *	cy82c693_remove_one	-	called with an Cypress is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an Cypress device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void cy82c693_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("Cypress removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_CONTAQ)
-		return 0;
+static struct pci_device_id cy82c693_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = cy82c693_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"Cypress IDE",
+	id_table:	cy82c693_pci_tbl,
+	probe:		cy82c693_init_one,
+	remove:		__devexit_p(cy82c693_remove_one),
+};
+
+static int cy82c693_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void cy82c693_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(cy82c693_ide_init);
+module_exit(cy82c693_ide_exit);
+
+MODULE_AUTHOR("Andreas Krebs, Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for the Cypress CY82C693 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/cy82c693.h b/drivers/ide/pci/cy82c693.h
index 442978970be1..b5c6f9652d51 100644
--- a/drivers/ide/pci/cy82c693.h
+++ b/drivers/ide/pci/cy82c693.h
@@ -64,17 +64,15 @@ typedef struct pio_clocks_s {
 	u8	time_8;		/* clocks for 8bit (0xF0=Active/data, 0x0F=Recovery) */
 } pio_clocks_t;
 
-extern void init_setup_cy82c693(struct pci_dev *, ide_pci_device_t *);
 extern unsigned int init_chipset_cy82c693(struct pci_dev *, const char *);
 extern void init_hwif_cy82c693(ide_hwif_t *);
 extern void init_dma_cy82c693(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t cy82c693_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_CONTAQ,
 		device:		PCI_DEVICE_ID_CONTAQ_82C693,
 		name:		"CY82C693",
-		init_setup:	init_setup_cy82c693,
 		init_chipset:	init_chipset_cy82c693,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_cy82c693,
diff --git a/drivers/ide/pci/generic.c b/drivers/ide/pci/generic.c
index da04d093478d..7de03692f9ff 100644
--- a/drivers/ide/pci/generic.c
+++ b/drivers/ide/pci/generic.c
@@ -1,5 +1,5 @@
 /*
- *  linux/drivers/ide/pci-orphan.c	Version 0.01	December 8, 1997
+ *  linux/drivers/ide/generic.c		Version 0.10	Sept 11, 2002
  *
  *  Copyright (C) 2001-2002	Andre Hedrick <andre@linux-ide.org>
  */
@@ -9,6 +9,7 @@
 
 #include <linux/config.h> /* for CONFIG_BLK_DEV_IDEPCI */
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/timer.h>
@@ -84,19 +85,10 @@ static void __init init_setup_unknown (struct pci_dev *dev, ide_pci_device_t *d)
 	ide_setup_pci_device(dev, d);
 }
 
-int __init generic_scan_pcidev (struct pci_dev *dev)
-{
-	ide_pci_device_t *d;
-
-	for (d = generic_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
+#if 0
 
+	/* Logic to add back later on */
+	
 	if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE) {
 		ide_pci_device_t *unknown = unknown_chipset;
 //		unknown->vendor = dev->vendor;
@@ -105,5 +97,75 @@ int __init generic_scan_pcidev (struct pci_dev *dev)
 		return 1;
 	}
 	return 0;
+#endif	
+
+/**
+ *	generic_init_one	-	called when a PIIX is found
+ *	@dev: the generic device
+ *	@id: the matching pci id
+ *
+ *	Called when the PCI registration layer (or the IDE initialization)
+ *	finds a device matching our IDE device tables.
+ */
+ 
+static int __devinit generic_init_one(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	ide_pci_device_t *d = &generic_chipsets[id->driver_data];
+
+	if (dev->device != d->device)
+		BUG();
+	d->init_setup(dev, d);
+	return 0;
 }
 
+/**
+ *	generic_remove_one	-	called when PCI IDE is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an IDE device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void generic_remove_one(struct pci_dev *dev)
+{
+	panic("PCI IDE removal not yet supported");
+}
+
+static struct pci_device_id generic_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_NS,     PCI_DEVICE_ID_NS_87410,            PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_PCTECH, PCI_DEVICE_ID_PCTECH_SAMURAI_IDE,  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ PCI_VENDOR_ID_HOLTEK, PCI_DEVICE_ID_HOLTEK_6565,         PCI_ANY_ID, PCI_ANY_ID, 0, 0, 2},
+	{ PCI_VENDOR_ID_UMC,    PCI_DEVICE_ID_UMC_UM8673F,         PCI_ANY_ID, PCI_ANY_ID, 0, 0, 3},
+	{ PCI_VENDOR_ID_UMC,    PCI_DEVICE_ID_UMC_UM8886A,         PCI_ANY_ID, PCI_ANY_ID, 0, 0, 4},
+	{ PCI_VENDOR_ID_UMC,    PCI_DEVICE_ID_UMC_UM8886BF,        PCI_ANY_ID, PCI_ANY_ID, 0, 0, 5},
+	{ PCI_VENDOR_ID_HINT,   PCI_DEVICE_ID_HINT_VXPROII_IDE,    PCI_ANY_ID, PCI_ANY_ID, 0, 0, 6},
+	{ PCI_VENDOR_ID_VIA,    PCI_DEVICE_ID_VIA_82C561,          PCI_ANY_ID, PCI_ANY_ID, 0, 0, 7},
+	{ PCI_VENDOR_ID_OPTI,   PCI_DEVICE_ID_OPTI_82C558,         PCI_ANY_ID, PCI_ANY_ID, 0, 0, 8},
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"PCI IDE",
+	id_table:	generic_pci_tbl,
+	probe:		generic_init_one,
+	remove:		__devexit_p(generic_remove_one),
+};
+
+static int generic_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void generic_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(generic_ide_init);
+module_exit(generic_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for generic PCI IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/generic.h b/drivers/ide/pci/generic.h
index ba58c7bd4067..ea594b5ebdfb 100644
--- a/drivers/ide/pci/generic.h
+++ b/drivers/ide/pci/generic.h
@@ -11,7 +11,7 @@ static void init_hwif_generic(ide_hwif_t *);
 static void init_dma_generic(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t generic_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_NS,
 		device:		PCI_DEVICE_ID_NS_87410,
 		name:		"NS87410",
@@ -25,7 +25,7 @@ static ide_pci_device_t generic_chipsets[] __initdata = {
 		enablebits:	{{0x43,0x08,0x08}, {0x47,0x08,0x08}},
 		bootable:	ON_BOARD,
 		extra:		0,
-        },{
+        },{	/* 1 */
 		vendor:		PCI_VENDOR_ID_PCTECH,
 		device:		PCI_DEVICE_ID_PCTECH_SAMURAI_IDE,
 		name:		"SAMURAI",
@@ -39,7 +39,7 @@ static ide_pci_device_t generic_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 2 */
 		vendor:		PCI_VENDOR_ID_HOLTEK,
 		device:		PCI_DEVICE_ID_HOLTEK_6565,
 		name:		"HT6565",
@@ -53,7 +53,7 @@ static ide_pci_device_t generic_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 3 */
 		vendor:		PCI_VENDOR_ID_UMC,
 		device:		PCI_DEVICE_ID_UMC_UM8673F,
 		name:		"UM8673F",
@@ -67,7 +67,7 @@ static ide_pci_device_t generic_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 4 */
 		vendor:		PCI_VENDOR_ID_UMC,
 		device:		PCI_DEVICE_ID_UMC_UM8886A,
 		name:		"UM8886A",
@@ -81,7 +81,7 @@ static ide_pci_device_t generic_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 5 */
 		vendor:		PCI_VENDOR_ID_UMC,
 		device:		PCI_DEVICE_ID_UMC_UM8886BF,
 		name:		"UM8886BF",
@@ -95,7 +95,7 @@ static ide_pci_device_t generic_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 6 */
 		vendor:		PCI_VENDOR_ID_HINT,
 		device:		PCI_DEVICE_ID_HINT_VXPROII_IDE,
 		name:		"HINT_IDE",
@@ -109,7 +109,7 @@ static ide_pci_device_t generic_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 7 */
 		vendor:		PCI_VENDOR_ID_VIA,
 		device:		PCI_DEVICE_ID_VIA_82C561,
 		name:		"VIA_IDE",
@@ -123,7 +123,7 @@ static ide_pci_device_t generic_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 8 */
 		vendor:		PCI_VENDOR_ID_OPTI,
 		device:		PCI_DEVICE_ID_OPTI_82C558,
 		name:		"OPTI621V",
@@ -146,7 +146,7 @@ static ide_pci_device_t generic_chipsets[] __initdata = {
 };
 
 static ide_pci_device_t unknown_chipset[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		0,
 		device:		0,
 		name:		"PCI_IDE",
diff --git a/drivers/ide/pci/hpt34x.c b/drivers/ide/pci/hpt34x.c
index 18b7a9646016..490b6d3cb9af 100644
--- a/drivers/ide/pci/hpt34x.c
+++ b/drivers/ide/pci/hpt34x.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/ide/hpt34x.c		Version 0.31	June. 9, 2000
+ * linux/drivers/ide/hpt34x.c		Version 0.40	Sept 10, 2002
  *
  * Copyright (C) 1998-2000	Andre Hedrick <andre@linux-ide.org>
  * May be copied or modified under the terms of the GNU General Public License
@@ -25,6 +25,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
@@ -321,34 +322,61 @@ static void __init init_dma_hpt34x (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_hpt34x (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit hpt34x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
-	char *chipset_names[] = {"HPT343", "HPT345"};
+	ide_pci_device_t *d = &hpt34x_chipsets[id->driver_data];
+	static char *chipset_names[] = {"HPT343", "HPT345"};
 	u16 pcicmd = 0;
 
 	pci_read_config_word(dev, PCI_COMMAND, &pcicmd);
 
-	strcpy(d->name, chipset_names[(pcicmd & PCI_COMMAND_MEMORY) ? 1 : 0]);
+	d->name = chipset_names[(pcicmd & PCI_COMMAND_MEMORY) ? 1 : 0];
 	d->bootable = (pcicmd & PCI_COMMAND_MEMORY) ? OFF_BOARD : NEVER_BOARD;
 
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int  __init hpt34x_scan_pcidev (struct pci_dev *dev)
+/**
+ *	hpt34x_remove_one	-	called with an hpt34x is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an hpt34x device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void hpt34x_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("hpt34x removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_TTI)
-		return 0;
+static struct pci_device_id hpt34x_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_TTI, PCI_DEVICE_ID_TTI_HPT343, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = hpt34x_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"HPT34x IDE",
+	id_table:	hpt34x_pci_tbl,
+	probe:		hpt34x_init_one,
+	remove:		__devexit_p(hpt34x_remove_one),
+};
+
+static int hpt34x_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void hpt34x_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(hpt34x_ide_init);
+module_exit(hpt34x_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for Highpoint 34x IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/hpt34x.h b/drivers/ide/pci/hpt34x.h
index d50363967604..941092974851 100644
--- a/drivers/ide/pci/hpt34x.h
+++ b/drivers/ide/pci/hpt34x.h
@@ -31,17 +31,15 @@ static ide_pci_host_proc_t hpt34x_procs[] __initdata = {
 };
 #endif  /* defined(DISPLAY_HPT34X_TIMINGS) && defined(CONFIG_PROC_FS) */
 
-static void init_setup_hpt34x(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_hpt34x(struct pci_dev *, const char *);
 static void init_hwif_hpt34x(ide_hwif_t *);
 static void init_dma_hpt34x(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t hpt34x_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_TTI,
 		device:		PCI_DEVICE_ID_TTI_HPT343,
 		name:		"HPT34X",
-		init_setup:	init_setup_hpt34x,
 		init_chipset:	init_chipset_hpt34x,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_hpt34x,
diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c
index 145123019e5d..da8d6548771f 100644
--- a/drivers/ide/pci/hpt366.c
+++ b/drivers/ide/pci/hpt366.c
@@ -44,6 +44,7 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/timer.h>
@@ -1167,23 +1168,70 @@ static void __init init_setup_hpt366 (struct pci_dev *dev, ide_pci_device_t *d)
 	ide_setup_pci_device(dev, d);
 }
 
-int __init hpt366_scan_pcidev (struct pci_dev *dev)
-{
-	ide_pci_device_t *d;
 
-	if (dev->vendor != PCI_VENDOR_ID_TTI)
-		return 0;
-	if (dev->device == PCI_DEVICE_ID_TTI_HPT343)
-		return 0;
+/**
+ *	hpt366_init_one	-	called when an HPT366 is found
+ *	@dev: the hpt366 device
+ *	@id: the matching pci id
+ *
+ *	Called when the PCI registration layer (or the IDE initialization)
+ *	finds a device matching our IDE device tables.
+ */
+ 
+static int __devinit hpt366_init_one(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	ide_pci_device_t *d = &hpt366_chipsets[id->driver_data];
 
-	for (d = hpt366_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
+	if (dev->device != d->device)
+		BUG();
+	d->init_setup(dev, d);
 	return 0;
 }
 
+/**
+ *	hpt366_remove_one	-	called when an HPT366 is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect a HPT366 device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void hpt366_remove_one(struct pci_dev *dev)
+{
+	panic("HPT366 removal not yet supported");
+}
+
+static struct pci_device_id hpt366_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_TTI, PCI_DEVICE_ID_TTI_HPT366, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_TTI, PCI_DEVICE_ID_TTI_HPT372, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ PCI_VENDOR_ID_TTI, PCI_DEVICE_ID_TTI_HPT302, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 2},
+	{ PCI_VENDOR_ID_TTI, PCI_DEVICE_ID_TTI_HPT371, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 3},
+	{ PCI_VENDOR_ID_TTI, PCI_DEVICE_ID_TTI_HPT374, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 15},
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"HPT366 IDE",
+	id_table:	hpt366_pci_tbl,
+	probe:		hpt366_init_one,
+	remove:		__devexit_p(hpt366_remove_one),
+};
+
+static int hpt366_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void hpt366_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(hpt366_ide_init);
+module_exit(hpt366_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for Highpoint HPT366 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/hpt366.h b/drivers/ide/pci/hpt366.h
index 196d2e8aaa6c..a993b0a75275 100644
--- a/drivers/ide/pci/hpt366.h
+++ b/drivers/ide/pci/hpt366.h
@@ -442,7 +442,7 @@ static void init_hwif_hpt366(ide_hwif_t *);
 static void init_dma_hpt366(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t hpt366_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_TTI,
 		device:		PCI_DEVICE_ID_TTI_HPT366,
 		name:		"HPT366",
@@ -456,7 +456,7 @@ static ide_pci_device_t hpt366_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	OFF_BOARD,
 		extra:		240
-	},{
+	},{	/* 1 */
 		vendor:		PCI_VENDOR_ID_TTI,
 		device:		PCI_DEVICE_ID_TTI_HPT372,
 		name:		"HPT372A",
@@ -470,7 +470,7 @@ static ide_pci_device_t hpt366_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	OFF_BOARD,
 		extra:		0
-	},{
+	},{	/* 2 */
 		vendor:		PCI_VENDOR_ID_TTI,
 		device:		PCI_DEVICE_ID_TTI_HPT302,
 		name:		"HPT302",
@@ -484,7 +484,7 @@ static ide_pci_device_t hpt366_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	OFF_BOARD,
 		extra:		0
-	},{
+	},{	/* 3 */
 		vendor:		PCI_VENDOR_ID_TTI,
 		device:		PCI_DEVICE_ID_TTI_HPT371,
 		name:		"HPT371",
@@ -498,7 +498,7 @@ static ide_pci_device_t hpt366_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	OFF_BOARD,
 		extra:		0
-	},{
+	},{	/* 4 */
 		vendor:		PCI_VENDOR_ID_TTI,
 		device:		PCI_DEVICE_ID_TTI_HPT374,
 		name:		"HPT374",
diff --git a/drivers/ide/pci/it8172.c b/drivers/ide/pci/it8172.c
index 8655e2411919..07a3d0080fb2 100644
--- a/drivers/ide/pci/it8172.c
+++ b/drivers/ide/pci/it8172.c
@@ -29,6 +29,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/ioport.h>
@@ -297,29 +298,56 @@ static void __init init_dma_it8172 (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_it8172 (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit it8172_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &it8172_chipsets[id->driver_data];
         if ((!(PCI_FUNC(dev->devfn) & 1) ||
             (!((dev->class >> 8) == PCI_CLASS_STORAGE_IDE))))
-                return; /* IT8172 is more than only a IDE controller */
+                return 0; /* IT8172 is more than only a IDE controller */
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-static int __init it8172_scan_pcidev (struct pci_dev *dev)
+/**
+ *	it8172_remove_one	-	called with an IT8172 is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an IT8172 device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void it8172_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("IT8172 removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_ITE)
-		return 0;
+static struct pci_device_id it8172_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_ITE, PCI_DEVICE_ID_ITE_IT8172G, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = it8172_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"IT8172IDE",
+	id_table:	it8172_pci_tbl,
+	probe:		it8172_init_one,
+	remove:		__devexit_p(it8172_remove_one),
+};
+
+static int it8172_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void it8172_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(it8172_ide_init);
+module_exit(it8172_ide_exit);
+
+MODULE_AUTHOR("SteveL@mvista.com");
+MODULE_DESCRIPTION("PCI driver module for ITE 8172 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/it8172.h b/drivers/ide/pci/it8172.h
index 40d107a9b366..555bde0bd3e1 100644
--- a/drivers/ide/pci/it8172.h
+++ b/drivers/ide/pci/it8172.h
@@ -20,7 +20,7 @@ static void init_hwif_it8172(ide_hwif_t *);
 static void init_dma_it8172(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t it8172_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_ITE,
 		device:		PCI_DEVICE_ID_ITE_IT8172G,
 		name:		"IT8172G",
diff --git a/drivers/ide/pci/ns87415.c b/drivers/ide/pci/ns87415.c
index 02e34b1acf0c..b113158a9937 100644
--- a/drivers/ide/pci/ns87415.c
+++ b/drivers/ide/pci/ns87415.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/ide/ns87415.c		Version 1.01  Mar. 18, 2000
+ * linux/drivers/ide/ns87415.c		Version 2.00  Sep. 10, 2002
  *
  * Copyright (C) 1997-1998	Mark Lord <mlord@pobox.com>
  * Copyright (C) 1998		Eddie C. Dost <ecd@skynet.be>
@@ -9,6 +9,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/timer.h>
@@ -229,26 +230,55 @@ static void __init init_dma_ns87415 (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_ns87415 (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit ns87415_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &ns87415_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init ns87415_scan_pcidev (struct pci_dev *dev)
+/**
+ *	ns87415_remove_one	-	called with an NS87415 is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an NS87415 device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void ns87415_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("NS87415 removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_NS)
-		return 0;
+static struct pci_device_id ns87415_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_87415, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = ns87415_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"NS87415IDE",
+	id_table:	ns87415_pci_tbl,
+	probe:		ns87415_init_one,
+	remove:		__devexit_p(ns87415_remove_one),
+};
+
+static int ns87415_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void ns87415_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(ns87415_ide_init);
+module_exit(ns87415_ide_exit);
+
+MODULE_AUTHOR("Mark Lord, Eddie Dost, Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for NS87415 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/ns87415.h b/drivers/ide/pci/ns87415.h
index 24519cbd5810..636481478a92 100644
--- a/drivers/ide/pci/ns87415.h
+++ b/drivers/ide/pci/ns87415.h
@@ -5,16 +5,14 @@
 #include <linux/pci.h>
 #include <linux/ide.h>
 
-static void init_setup_ns87415(struct pci_dev *, ide_pci_device_t *);
 static void init_hwif_ns87415(ide_hwif_t *);
 static void init_dma_ns87415(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t ns87415_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_NS,
 		device:		PCI_DEVICE_ID_NS_87415,
 		name:		"NS87415",
-		init_setup:	init_setup_ns87415,
 		init_chipset:	NULL,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_ns87415,
diff --git a/drivers/ide/pci/nvidia.c b/drivers/ide/pci/nvidia.c
index 3907830d40fa..1c073eba731d 100644
--- a/drivers/ide/pci/nvidia.c
+++ b/drivers/ide/pci/nvidia.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
@@ -91,8 +92,8 @@ static u8 nforce_ratemask (ide_drive_t *drive)
  */
 static int nforce_tune_chipset (ide_drive_t *drive, u8 xferspeed)
 {
-	u8 drive_pci[]		= { 0x63, 0x62, 0x61, 0x60 };
-	u8 drive_pci2[]		= { 0x5b, 0x5a, 0x59, 0x58 };
+	static const u8 drive_pci[] = { 0x63, 0x62, 0x61, 0x60 };
+	static const u8 drive_pci2[] = { 0x5b, 0x5a, 0x59, 0x58 };
 
 	ide_hwif_t *hwif	= HWIF(drive);
 	struct pci_dev *dev	= hwif->pci_dev;
@@ -336,27 +337,55 @@ static void __init init_dma_nforce (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-/* FIXME - not needed */
-static void __init init_setup_nforce (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit nforce_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &nvidia_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init nforce_scan_pcidev (struct pci_dev *dev)
+/**
+ *	nforce_remove_one	-	called with an nForce is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an nForce device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void nforce_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("nForce removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_NVIDIA)
-		return 0;
+static struct pci_device_id nforce_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = nvidia_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"nForce IDE",
+	id_table:	nforce_pci_tbl,
+	probe:		nforce_init_one,
+	remove:		__devexit_p(nforce_remove_one),
+};
+
+static int nforce_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void nforce_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
 }
 
+module_init(nforce_ide_init);
+module_exit(nforce_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for nVidia nForce IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/nvidia.h b/drivers/ide/pci/nvidia.h
index fe0de703ca4a..2ba7c8b5bcfd 100644
--- a/drivers/ide/pci/nvidia.h
+++ b/drivers/ide/pci/nvidia.h
@@ -25,7 +25,6 @@ static ide_pci_host_proc_t nforce_procs[] __initdata = {
 };
 #endif  /* defined(DISPLAY_NFORCE_TIMINGS) && defined(CONFIG_PROC_FS) */
 
-static void init_setup_nforce(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_nforce(struct pci_dev *, const char *);
 static void init_hwif_nforce(ide_hwif_t *);
 static void init_dma_nforce(ide_hwif_t *, unsigned long);
@@ -35,7 +34,6 @@ static ide_pci_device_t nvidia_chipsets[] __initdata = {
 		vendor:		PCI_VENDOR_ID_NVIDIA,
 		device:		PCI_DEVICE_ID_NVIDIA_NFORCE_IDE,
 		name:		"NFORCE",
-		init_setup:	init_setup_nforce,
 		init_chipset:	init_chipset_nforce,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_nforce,
@@ -45,12 +43,8 @@ static ide_pci_device_t nvidia_chipsets[] __initdata = {
 		enablebits:	{{0x50,0x01,0x01}, {0x50,0x02,0x02}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
-		vendor:		0,
-		device:		0,
-		channels:	0,
-		bootable:	EOL,
 	}
 };
 
+
 #endif /* NFORCE_H */
diff --git a/drivers/ide/pci/opti621.c b/drivers/ide/pci/opti621.c
index b671598a088d..02b4b1ee6acc 100644
--- a/drivers/ide/pci/opti621.c
+++ b/drivers/ide/pci/opti621.c
@@ -1,5 +1,5 @@
 /*
- *  linux/drivers/ide/opti621.c		Version 0.6	Jan 02, 1999
+ *  linux/drivers/ide/opti621.c		Version 0.7	Sept 10, 2002
  *
  *  Copyright (C) 1996-1998  Linus Torvalds & authors (see below)
  */
@@ -91,6 +91,7 @@
 #define OPTI621_DEBUG		/* define for debug messages */
 
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/timer.h>
@@ -363,21 +364,56 @@ static void __init init_setup_opti621 (struct pci_dev *dev, ide_pci_device_t *d)
 	ide_setup_pci_device(dev, d);
 }
 
-int __init opti621_scan_pcidev (struct pci_dev *dev)
+static int __devinit opti621_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
-	ide_pci_device_t *d;
+	ide_pci_device_t *d = &opti621_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
+	ide_setup_pci_device(dev, d);
+	return 0;
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_OPTI)
-		return 0;
+/**
+ *	opti621_remove_one	-	called with an Opti621 is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an Opti621 device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void opti621_remove_one(struct pci_dev *dev)
+{
+	panic("Opti621 removal not yet supported");
+}
 
-	for (d = opti621_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_device_id opti621_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_OPTI, PCI_DEVICE_ID_OPTI_82C621, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_OPTI, PCI_DEVICE_ID_OPTI_82C825, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"Opti621 IDE",
+	id_table:	opti621_pci_tbl,
+	probe:		opti621_init_one,
+	remove:		__devexit_p(opti621_remove_one),
+};
+
+static int opti621_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void opti621_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(opti621_ide_init);
+module_exit(opti621_ide_exit);
+
+MODULE_AUTHOR("Jaromir Koutek, Jan Harkes, Mark Lord");
+MODULE_DESCRIPTION("PCI driver module for Opti621 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/opti621.h b/drivers/ide/pci/opti621.h
index f513326d9a2d..4db774085684 100644
--- a/drivers/ide/pci/opti621.h
+++ b/drivers/ide/pci/opti621.h
@@ -10,7 +10,7 @@ static void init_hwif_opti621(ide_hwif_t *);
 static void init_dma_opti621(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t opti621_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_OPTI,
 		device:		PCI_DEVICE_ID_OPTI_82C621,
 		name:		"OPTI621",
@@ -24,7 +24,7 @@ static ide_pci_device_t opti621_chipsets[] __initdata = {
 		enablebits:	{{0x45,0x80,0x00}, {0x40,0x08,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 1 */
 		vendor:		PCI_VENDOR_ID_OPTI,
 		device:		PCI_DEVICE_ID_OPTI_82C825,
 		name:		"OPTI621X",
diff --git a/drivers/ide/pci/pdcadma.c b/drivers/ide/pci/pdcadma.c
index 558007ab81d6..df801c9a5bbf 100644
--- a/drivers/ide/pci/pdcadma.c
+++ b/drivers/ide/pci/pdcadma.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/ide/pdcadma.c		Version 0.01	June 21, 2001
+ * linux/drivers/ide/pdcadma.c		Version 0.05	Sept 10, 2002
  *
  * Copyright (C) 1999-2000		Andre Hedrick <andre@linux-ide.org>
  * May be copied or modified under the terms of the GNU General Public License
@@ -127,26 +127,55 @@ static void __init init_dma_pdcadma (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_pdcadma (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit pdcadma_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &pdcadma_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 1;
 }
 
-int __init pdcadma_scan_pcidev (struct pci_dev *dev)
+/**
+ *	pdcadma_remove_one	-	called when a PDCADMA is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect a PDCADMA device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void pdcadma_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
-
-	if (dev->vendor != PCI_VENDOR_ID_PDC)
-		return 0;
-
-	for (d = pdcadma_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+	panic("PDCADMA removal not yet supported");
+}
+
+static struct pci_device_id pdcadma_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_PDC, PCI_DEVICE_ID_PDC_1841, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"PDCADMA-IDE",
+	id_table:	pdcadma_pci_tbl,
+	probe:		pdcadma_init_one,
+	remove:		__devexit_p(pdcadma_remove_one),
+};
+
+static int pdcadma_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void pdcadma_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(pdcadma_ide_init);
+module_exit(pdcadma_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for PDCADMA IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/pdcadma.h b/drivers/ide/pci/pdcadma.h
index 03d13edc65b0..50f2a5646031 100644
--- a/drivers/ide/pci/pdcadma.h
+++ b/drivers/ide/pci/pdcadma.h
@@ -31,7 +31,7 @@ static void init_hwif_pdcadma(ide_hwif_t *);
 static void init_dma_pdcadma(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t pdcadma_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_PDC,
 		device:		PCI_DEVICE_ID_PDC_1841,
 		name:		"PDCADMA",
diff --git a/drivers/ide/pci/piix.c b/drivers/ide/pci/piix.c
index 708130689114..34b743fd1521 100644
--- a/drivers/ide/pci/piix.c
+++ b/drivers/ide/pci/piix.c
@@ -55,6 +55,7 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
@@ -664,29 +665,79 @@ static void __init init_setup_piix (struct pci_dev *dev, ide_pci_device_t *d)
 }
 
 /**
- *	piix_scan_pcidev	-	Check for and initialize PIIX IDE
- *	@dev: PCI device to check
+ *	piix_init_one	-	called when a PIIX is found
+ *	@dev: the piix device
+ *	@id: the matching pci id
  *
- *	Checks if the passed device is an Intel PIIX device. If so the
- *	hardware is initialized and we return 1 to claim the device. If not
- *	we return 0.
+ *	Called when the PCI registration layer (or the IDE initialization)
+ *	finds a device matching our IDE device tables.
  */
  
-int __init piix_scan_pcidev (struct pci_dev *dev)
+static int __devinit piix_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
-	ide_pci_device_t *d;
+	ide_pci_device_t *d = &piix_pci_info[id->driver_data];
 
-	if (dev->vendor != PCI_VENDOR_ID_INTEL)
-		return 0;
-
-	for (d = piix_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
+	if (dev->device != d->device)
+		BUG();
+	d->init_setup(dev, d);
 	return 0;
 }
 
+/**
+ *	piix_remove_one	-	called with a PIIX is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect a PIIX device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void piix_remove_one(struct pci_dev *dev)
+{
+	panic("PIIX removal not yet supported");
+}
+
+static struct pci_device_id piix_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371FB_0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371FB_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371MX,   PCI_ANY_ID, PCI_ANY_ID, 0, 0, 2},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371SB_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 3},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82371AB,   PCI_ANY_ID, PCI_ANY_ID, 0, 0, 4},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AB_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 5},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443MX_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 6},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801AA_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 7},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82372FB_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 8},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX,   PCI_ANY_ID, PCI_ANY_ID, 0, 0, 9},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 10},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801BA_8, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 11},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_10,PCI_ANY_ID, PCI_ANY_ID, 0, 0, 12},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_11,PCI_ANY_ID, PCI_ANY_ID, 0, 0, 13},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_11,PCI_ANY_ID, PCI_ANY_ID, 0, 0, 14},
+	{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801E_11, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 15},
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"PIIX IDE",
+	id_table:	piix_pci_tbl,
+	probe:		piix_init_one,
+	remove:		__devexit_p(piix_remove_one),
+};
+
+static int piix_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void piix_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(piix_ide_init);
+module_exit(piix_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick, Andrzej Krzysztofowicz");
+MODULE_DESCRIPTION("PCI driver module for Intel PIIX IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/piix.h b/drivers/ide/pci/piix.h
index a51606a2d3a3..fe3073f155d1 100644
--- a/drivers/ide/pci/piix.h
+++ b/drivers/ide/pci/piix.h
@@ -28,12 +28,18 @@ static ide_pci_host_proc_t piix_procs[] __initdata = {
 #endif  /* defined(DISPLAY_PIIX_TIMINGS) && defined(CONFIG_PROC_FS) */
 
 static void init_setup_piix(struct pci_dev *, ide_pci_device_t *);
-static unsigned int init_chipset_piix(struct pci_dev *, const char *);
+static unsigned int __init init_chipset_piix(struct pci_dev *, const char *);
 static void init_hwif_piix(ide_hwif_t *);
 static void init_dma_piix(ide_hwif_t *, unsigned long);
 
-static ide_pci_device_t piix_chipsets[] __initdata = {
-	{
+
+/*
+ *	Table of the various PIIX capability blocks
+ *
+ */
+ 
+static ide_pci_device_t piix_pci_info[] __devinit = {
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82371FB_0,
 		name:		"PIIXa",
@@ -47,7 +53,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 1 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82371FB_1,
 		name:		"PIIXb",
@@ -61,7 +67,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 2 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82371MX,
 		name:		"MPIIX",
@@ -75,7 +81,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x6D,0x80,0x80}, {0x6F,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 3 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82371SB_1,
 		name:		"PIIX3",
@@ -89,7 +95,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 4 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82371AB,
 		name:		"PIIX4",
@@ -103,7 +109,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 5 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82801AB_1,
 		name:		"ICH0",
@@ -117,7 +123,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 6 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82443MX_1,
 		name:		"PIIX4",
@@ -131,7 +137,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 7 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82801AA_1,
 		name:		"ICH",
@@ -145,7 +151,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 8 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82372FB_1,
 		name:		"PIIX4",
@@ -159,7 +165,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 9 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82451NX,
 		name:		"PIIX4",
@@ -173,7 +179,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 10 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82801BA_9,
 		name:		"ICH2",
@@ -187,7 +193,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 11 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82801BA_8,
 		name:		"ICH2M",
@@ -201,7 +207,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 12 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82801CA_10,
 		name:		"ICH3M",
@@ -215,7 +221,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 13 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82801CA_11,
 		name:		"ICH3",
@@ -229,7 +235,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 14 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82801DB_11,
 		name:		"ICH4",
@@ -243,7 +249,7 @@ static ide_pci_device_t piix_chipsets[] __initdata = {
 		enablebits:	{{0x41,0x80,0x80}, {0x43,0x80,0x80}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 15 */
 		vendor:		PCI_VENDOR_ID_INTEL,
 		device:		PCI_DEVICE_ID_INTEL_82801E_11,
 		name:		"C-ICH",
diff --git a/drivers/ide/pci/rz1000.c b/drivers/ide/pci/rz1000.c
index f37704838d70..26868da97c47 100644
--- a/drivers/ide/pci/rz1000.c
+++ b/drivers/ide/pci/rz1000.c
@@ -19,6 +19,7 @@
 
 #include <linux/config.h> /* for CONFIG_BLK_DEV_IDEPCI */
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/timer.h>
@@ -55,26 +56,57 @@ static void __init init_hwif_rz1000 (ide_hwif_t *hwif)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_rz1000 (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit rz1000_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &rz1000_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init rz1000_scan_pcidev (struct pci_dev *dev)
+/**
+ *	rz1000_remove_one	-	called with an RZ1000 is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an RZ1000 device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void rz1000_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
-
-	if (dev->vendor != PCI_VENDOR_ID_PCTECH)
-		return 0;
-
-	for (d = rz1000_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+	panic("RZ1000 removal not yet supported");
+}
+
+static struct pci_device_id rz1000_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_PCTECH, PCI_DEVICE_ID_PCTECH_RZ1000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_PCTECH, PCI_DEVICE_ID_PCTECH_RZ1001, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"RZ1000 IDE",
+	id_table:	rz1000_pci_tbl,
+	probe:		rz1000_init_one,
+	remove:		__devexit_p(rz1000_remove_one),
+};
+
+static int rz1000_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void rz1000_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
 }
 
+module_init(rz1000_ide_init);
+module_exit(rz1000_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for RZ1000 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
+
diff --git a/drivers/ide/pci/rz1000.h b/drivers/ide/pci/rz1000.h
index 15fbca3a2fd9..91dbe8889436 100644
--- a/drivers/ide/pci/rz1000.h
+++ b/drivers/ide/pci/rz1000.h
@@ -5,7 +5,6 @@
 #include <linux/pci.h>
 #include <linux/ide.h>
 
-static void init_setup_rz1000(struct pci_dev *, ide_pci_device_t *);
 static void init_hwif_rz1000(ide_hwif_t *);
 
 static ide_pci_device_t rz1000_chipsets[] __initdata = {
@@ -13,7 +12,6 @@ static ide_pci_device_t rz1000_chipsets[] __initdata = {
 		vendor:		PCI_VENDOR_ID_PCTECH,
 		device:		PCI_DEVICE_ID_PCTECH_RZ1000,
 		name:		"RZ1000",
-		init_setup:	init_setup_rz1000,
 		init_chipset:	NULL,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_rz1000,
@@ -27,7 +25,6 @@ static ide_pci_device_t rz1000_chipsets[] __initdata = {
 		vendor:		PCI_VENDOR_ID_PCTECH,
 		device:		PCI_DEVICE_ID_PCTECH_RZ1001,
 		name:		"RZ1001",
-		init_setup:	init_setup_rz1000,
 		init_chipset:	NULL,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_rz1000,
diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
index b8bb7c82778b..a61cc9451b7a 100644
--- a/drivers/ide/pci/serverworks.c
+++ b/drivers/ide/pci/serverworks.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/ide/serverworks.c		Version 0.6	05 April 2002
+ * linux/drivers/ide/serverworks.c		Version 0.7	10 Sept 2002
  *
  * Copyright (C) 1998-2000 Michel Aubry
  * Copyright (C) 1998-2000 Andrzej Krzysztofowicz
@@ -25,6 +25,7 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
@@ -766,21 +767,73 @@ static void __init init_setup_csb6 (struct pci_dev *dev, ide_pci_device_t *d)
 	ide_setup_pci_device(dev, d);
 }
 
-int __init serverworks_scan_pcidev (struct pci_dev *dev)
-{
-	ide_pci_device_t *d;
 
-	if (dev->vendor != PCI_VENDOR_ID_SERVERWORKS)
-		return 0;
+/**
+ *	svwks_init_one	-	called when a OSB/CSB is found
+ *	@dev: the svwks device
+ *	@id: the matching pci id
+ *
+ *	Called when the PCI registration layer (or the IDE initialization)
+ *	finds a device matching our IDE device tables.
+ */
+ 
+static int __devinit svwks_init_one(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	ide_pci_device_t *d = &serverworks_chipsets[id->driver_data];
 
-	for (d = serverworks_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
+	if (dev->device != d->device)
+		BUG();
+	d->init_setup(dev, d);
 	return 0;
 }
+
+/**
+ *	svwks_remove_one	-	called when an OSB/CSB is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect a SVWKS device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
  
+static void svwks_remove_one(struct pci_dev *dev)
+{
+	panic("SVWKS removal not yet supported");
+}
+
+static struct pci_device_id svwks_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_OSB4IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB5IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB6IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 2},
+	{ PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 3},
+	{ 0, },
+};
+
+static struct pci_driver driver = {
+	name:		"Serverworks IDE",
+	id_table:	svwks_pci_tbl,
+	probe:		svwks_init_one,
+	remove:		__devexit_p(svwks_remove_one),
+#if 0	/* FIXME: implement */
+	suspend:	,
+	resume:		,
+#endif
+};
+
+static int svwks_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void svwks_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(svwks_ide_init);
+module_exit(svwks_ide_exit);
+
+MODULE_AUTHOR("Michael Aubry. Andrzej Krzysztofowicz, Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for Serverworks OSB4/CSB5/CSB6 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/serverworks.h b/drivers/ide/pci/serverworks.h
index 014a72be02cc..22218e46514d 100644
--- a/drivers/ide/pci/serverworks.h
+++ b/drivers/ide/pci/serverworks.h
@@ -38,7 +38,7 @@ static void init_hwif_svwks(ide_hwif_t *);
 static void init_dma_svwks(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t serverworks_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_SERVERWORKS,
 		device:		PCI_DEVICE_ID_SERVERWORKS_OSB4IDE,
 		name:		"SvrWks OSB4",
@@ -52,7 +52,7 @@ static ide_pci_device_t serverworks_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 1 */
 		vendor:		PCI_VENDOR_ID_SERVERWORKS,
 		device:		PCI_DEVICE_ID_SERVERWORKS_CSB5IDE,
 		name:		"SvrWks CSB5",
@@ -66,7 +66,7 @@ static ide_pci_device_t serverworks_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 2 */
 		vendor:		PCI_VENDOR_ID_SERVERWORKS,
 		device:		PCI_DEVICE_ID_SERVERWORKS_CSB6IDE,
 		name:		"SvrWks CSB6",
@@ -80,7 +80,7 @@ static ide_pci_device_t serverworks_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 3 */
 		vendor:		PCI_VENDOR_ID_SERVERWORKS,
 		device:		PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2,
 		name:		"SvrWks CSB6",
diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c
index a2dc72809475..bedf6e18e218 100644
--- a/drivers/ide/pci/siimage.c
+++ b/drivers/ide/pci/siimage.c
@@ -1,11 +1,12 @@
 /*
- * linux/drivers/ide/siimage.c		Version 1.00	May 9, 2002
+ * linux/drivers/ide/siimage.c		Version 1.01	Sept 11, 2002
  *
  * Copyright (C) 2001-2002	Andre Hedrick <andre@linux-ide.org>
  */
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
 #include <linux/hdreg.h>
@@ -836,26 +837,57 @@ static void __init init_dma_siimage (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_siimage (struct pci_dev *dev, ide_pci_device_t *d)
+
+static int __devinit siimage_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &siimage_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init siimage_scan_pcidev (struct pci_dev *dev)
+/**
+ *	siimage_remove_one	-	called when an SI IDE is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an IDE device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void siimage_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("SiImage IDE removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_CMD)
-		return 0;
+static struct pci_device_id siimage_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_CMD, PCI_DEVICE_ID_SII_680,  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_CMD, PCI_DEVICE_ID_SII_3112, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ 0, },
+};
 
-	for (d = siimage_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"SiI IDE",
+	id_table:	siimage_pci_tbl,
+	probe:		siimage_init_one,
+	remove:		__devexit_p(siimage_remove_one),
+};
+
+static int siimage_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void siimage_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
 }
 
+module_init(siimage_ide_init);
+module_exit(siimage_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for SiI IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/siimage.h b/drivers/ide/pci/siimage.h
index 1034f7c06d7c..4df8918eca1c 100644
--- a/drivers/ide/pci/siimage.h
+++ b/drivers/ide/pci/siimage.h
@@ -107,18 +107,16 @@ static ide_pci_host_proc_t siimage_procs[] __initdata = {
 };
 #endif /* DISPLAY_SIIMAGE_TIMINGS && CONFIG_PROC_FS */	
 
-static void init_setup_siimage(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_siimage(struct pci_dev *, const char *);
 static void init_iops_siimage(ide_hwif_t *);
 static void init_hwif_siimage(ide_hwif_t *);
 static void init_dma_siimage(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t siimage_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_CMD,
 		device:		PCI_DEVICE_ID_SII_680,
 		name:		"SiI680",
-		init_setup:	init_setup_siimage,
 		init_chipset:	init_chipset_siimage,
 		init_iops:	init_iops_siimage,
 		init_hwif:	init_hwif_siimage,
@@ -128,11 +126,10 @@ static ide_pci_device_t siimage_chipsets[] __initdata = {
 		enablebits:	{{0x00,0x00,0x00}, {0x00,0x00,0x00}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 1 */
 		vendor:		PCI_VENDOR_ID_CMD,
 		device:		PCI_DEVICE_ID_SII_3112,
 		name:		"SiI3112 Serial ATA",
-		init_setup:	init_setup_siimage,
 		init_chipset:	init_chipset_siimage,
 		init_iops:	init_iops_siimage,
 		init_hwif:	init_hwif_siimage,
diff --git a/drivers/ide/pci/sis5513.c b/drivers/ide/pci/sis5513.c
index 3786d62d36f4..e0325914d036 100644
--- a/drivers/ide/pci/sis5513.c
+++ b/drivers/ide/pci/sis5513.c
@@ -1,5 +1,5 @@
 /*
- * linux/drivers/ide/sis5513.c		Version 0.14	July 24, 2002
+ * linux/drivers/ide/sis5513.c		Version 0.14ac	Sept 11, 2002
  *
  * Copyright (C) 1999-2000	Andre Hedrick <andre@linux-ide.org>
  * Copyright (C) 2002		Lionel Bouton <Lionel.Bouton@inet6.fr>, Maintainer
@@ -34,6 +34,7 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/timer.h>
@@ -1022,26 +1023,56 @@ static void __init init_dma_sis5513 (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_sis5513 (struct pci_dev *dev, ide_pci_device_t *d)
+
+static int __devinit sis5513_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &sis5513_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init sis5513_scan_pcidev (struct pci_dev *dev)
+/**
+ *	sis5513_remove_one	-	called when SIS IDE is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect a SIS IDE device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void sis5513_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("SIS IDE removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_SI)
-		return 0;
+static struct pci_device_id sis5513_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_5513, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = sis5513_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"SIS IDE",
+	id_table:	sis5513_pci_tbl,
+	probe:		sis5513_init_one,
+	remove:		__devexit_p(sis5513_remove_one),
+};
+
+static int sis5513_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void sis5513_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(sis5513_ide_init);
+module_exit(sis5513_ide_exit);
+
+MODULE_AUTHOR("Lionel Bouton, L C Chang, Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for SIS IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/sis5513.h b/drivers/ide/pci/sis5513.h
index 03240a8b322b..d15fc9c1ad8e 100644
--- a/drivers/ide/pci/sis5513.h
+++ b/drivers/ide/pci/sis5513.h
@@ -25,17 +25,15 @@ static ide_pci_host_proc_t sis_procs[] __initdata = {
 };
 #endif /* defined(DISPLAY_SIS_TIMINGS) && defined(CONFIG_PROC_FS) */
 
-static void init_setup_sis5513(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_sis5513(struct pci_dev *, const char *);
 static void init_hwif_sis5513(ide_hwif_t *);
 static void init_dma_sis5513(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t sis5513_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_SI,
 		device:		PCI_DEVICE_ID_SI_5513,
 		name:		"SIS5513",
-		init_setup:	init_setup_sis5513,
 		init_chipset:	init_chipset_sis5513,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_sis5513,
diff --git a/drivers/ide/pci/sl82c105.c b/drivers/ide/pci/sl82c105.c
index df97cd0a1cf1..ac71aff9236c 100644
--- a/drivers/ide/pci/sl82c105.c
+++ b/drivers/ide/pci/sl82c105.c
@@ -11,6 +11,7 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linuyx/module.h>
 #include <linux/kernel.h>
 #include <linux/timer.h>
 #include <linux/mm.h>
@@ -281,26 +282,54 @@ static void __init init_hwif_sl82c105(ide_hwif_t *hwif)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_sl82c105 (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit sl82c105_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &slc82c105_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init sl82c105_scan_pcidev (struct pci_dev *dev)
+/**
+ *	sl82c105_remove_one	-	called with an SLC82c105 is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an W82C105 device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void sl82c105_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("W82C105 removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_WINBOND)
-		return 0;
+static struct pci_device_id sl82c105_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = sl82c105_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"W82C105 IDE",
+	id_table:	sl82c105_pci_tbl,
+	probe:		sl82c105_init_one,
+	remove:		__devexit_p(sl82c105_remove_one),
+};
+
+static int sl82c105_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void sl82c105_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(sl82c105_ide_init);
+module_exit(sl82c105_ide_exit);
+
+MODULE_DESCRIPTION("PCI driver module for W82C105 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/sl82c105.h b/drivers/ide/pci/sl82c105.h
index 4e88028660fe..f8d9777c3892 100644
--- a/drivers/ide/pci/sl82c105.h
+++ b/drivers/ide/pci/sl82c105.h
@@ -5,17 +5,15 @@
 #include <linux/pci.h>
 #include <linux/ide.h>
 
-static void init_setup_sl82c105(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_sl82c105(struct pci_dev *, const char *);
 static void init_hwif_sl82c105(ide_hwif_t *);
 static void init_dma_sl82c105(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t sl82c105_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_WINBOND,
 		device:		PCI_DEVICE_ID_WINBOND_82C105,
 		name:		"W82C105",
-		init_setup:	init_setup_sl82c105,
 		init_chipset:	init_chipset_sl82c105,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_sl82c105,
diff --git a/drivers/ide/pci/slc90e66.c b/drivers/ide/pci/slc90e66.c
index 1d16aaa41c54..6053612a0dca 100644
--- a/drivers/ide/pci/slc90e66.c
+++ b/drivers/ide/pci/slc90e66.c
@@ -1,5 +1,5 @@
 /*
- *  linux/drivers/ide/slc90e66.c	Version 0.10	October 4, 2000
+ *  linux/drivers/ide/slc90e66.c	Version 0.11	September 11, 2002
  *
  *  Copyright (C) 2000-2002 Andre Hedrick <andre@linux-ide.org>
  *
@@ -10,6 +10,7 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
@@ -364,26 +365,56 @@ static void __init init_dma_slc90e66 (ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-static void __init init_setup_slc90e66 (struct pci_dev *dev, ide_pci_device_t *d)
+
+static int __devinit slc90e66_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &slc90e66_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init slc90e66_scan_pcidev (struct pci_dev *dev)
+/**
+ *	slc90e66_remove_one	-	called with an slc90e66 is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect an slc90e66 device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void slc90e66_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("slc90e66 removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_EFAR)
-		return 0;
+static struct pci_device_id slc90e66_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_EFAR, PCI_DEVICE_ID_EFAR_SLC90E66_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = slc90e66_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"SLC90e66 IDE",
+	id_table:	slc90e66_pci_tbl,
+	probe:		slc90e66_init_one,
+	remove:		__devexit_p(slc90e66_remove_one),
+};
+
+static int slc90e66_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
 }
 
+static void slc90e66_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
+}
+
+module_init(slc90e66_ide_init);
+module_exit(slc90e66_ide_exit);
+
+MODULE_AUTHOR("Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for SLC90E66 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/slc90e66.h b/drivers/ide/pci/slc90e66.h
index 0d0a79b649da..58c764fe0b3b 100644
--- a/drivers/ide/pci/slc90e66.h
+++ b/drivers/ide/pci/slc90e66.h
@@ -27,17 +27,15 @@ static ide_pci_host_proc_t slc90e66_procs[] __initdata = {
 };
 #endif	/* defined(DISPLAY_SLC90E66_TIMINGS) && defined(CONFIG_PROC_FS) */
 
-static void init_setup_slc90e66(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_slc90e66(struct pci_dev *, const char *);
 static void init_hwif_slc90e66(ide_hwif_t *);
 static void init_dma_slc90e66(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t slc90e66_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_EFAR,
 		device:		PCI_DEVICE_ID_EFAR_SLC90E66_1,
 		name:		"SLC90E66",
-		init_setup:	init_setup_slc90e66,
 		init_chipset:	init_chipset_slc90e66,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_slc90e66,
diff --git a/drivers/ide/pci/trm290.c b/drivers/ide/pci/trm290.c
index 61dd59f777f8..5cfb24411299 100644
--- a/drivers/ide/pci/trm290.c
+++ b/drivers/ide/pci/trm290.c
@@ -126,6 +126,7 @@
 
 #include <linux/config.h>
 #include <linux/types.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/ioport.h>
@@ -191,7 +192,7 @@ static int trm290_ide_dma_write (ide_drive_t *drive /*, struct request *rq */)
 	trm290_prepare_drive(drive, 0);	/* select PIO xfer */
 	return 1;
 #endif
-	if (!(count = ide_build_dmatable(drive, rq))) {
+	if (!(count = ide_build_dmatable(drive, rq, PCI_DMA_TODEVICE))) {
 		/* try PIO instead of DMA */
 		trm290_prepare_drive(drive, 0); /* select PIO xfer */
 		return 1;
@@ -235,7 +236,7 @@ static int trm290_ide_dma_read (ide_drive_t *drive  /*, struct request *rq */)
 	task_ioreg_t command	= WIN_NOP;
 	unsigned int count, reading = 2, writing = 0;
 
-	if (!(count = ide_build_dmatable(drive, rq))) {
+	if (!(count = ide_build_dmatable(drive, rq, PCI_DMA_FROMDEVICE))) {
 		/* try PIO instead of DMA */
 		trm290_prepare_drive(drive, 0); /* select PIO xfer */
 		return 1;
@@ -396,26 +397,55 @@ void __init init_hwif_trm290 (ide_hwif_t *hwif)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-void __init init_setup_trm290 (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit trm290_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &trm290_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
-int __init trm290_scan_pcidev (struct pci_dev *dev)
+/**
+ *	trm290_remove_one	-	called when an trm290 is unplugged
+ *	@dev: the device that was removed
+ *
+ *	Disconnect a trm290 device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
+ */
+ 
+static void trm290_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("trm290 removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_TEKRAM)
-		return 0;
+static struct pci_device_id trm290_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_TEKRAM, PCI_DEVICE_ID_TEKRAM_DC290, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
 
-	for (d = trm290_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"TRM290 IDE",
+	id_table:	trm290_pci_tbl,
+	probe:		trm290_init_one,
+	remove:		__devexit_p(trm290_remove_one),
+};
+
+static int trm290_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void trm290_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
 }
 
+module_init(trm290_ide_init);
+module_exit(trm290_ide_exit);
+
+MODULE_AUTHOR("Mark Lord");
+MODULE_DESCRIPTION("PCI driver module for Tekram TRM290 IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/trm290.h b/drivers/ide/pci/trm290.h
index 038cb39f6d04..81bb6194ef7e 100644
--- a/drivers/ide/pci/trm290.h
+++ b/drivers/ide/pci/trm290.h
@@ -5,15 +5,13 @@
 #include <linux/pci.h>
 #include <linux/ide.h>
 
-extern void init_setup_trm290(struct pci_dev *, ide_pci_device_t *);
 extern void init_hwif_trm290(ide_hwif_t *);
 
 static ide_pci_device_t trm290_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_TEKRAM,
 		device:		PCI_DEVICE_ID_TEKRAM_DC290,
 		name:		"TRM290",
-		init_setup:	init_setup_trm290,
 		init_chipset:	NULL,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_trm290,
diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c
index ee899bb839cf..59736cfcb88d 100644
--- a/drivers/ide/pci/via82cxxx.c
+++ b/drivers/ide/pci/via82cxxx.c
@@ -1,5 +1,5 @@
 /*
- * $Id: via82cxxx.c,v 3.35-ac1 2002/08/19 Alan Exp $
+ * $Id: via82cxxx.c,v 3.35-ac2 2002/09/111 Alan Exp $
  *
  *  Copyright (c) 2000-2001 Vojtech Pavlik
  *
@@ -33,6 +33,7 @@
  */
 
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/ioport.h>
 #include <linux/blkdev.h>
@@ -161,7 +162,7 @@ static int via_get_info(char *buffer, char **addr, off_t offset, int count)
 	via_print("Highest DMA rate:                   %s",
 		via_dma[via_config->flags & VIA_UDMA]);
 
-	via_print("BM-DMA base:                        %#x", via_base);
+	via_print("BM-DMA base:                        %#lx", via_base);
 	via_print("PCI clock:                          %d.%dMHz",
 		via_clock / 1000, via_clock / 100 % 10);
 
@@ -634,38 +635,56 @@ static void __init init_dma_via82cxxx(ide_hwif_t *hwif, unsigned long dmabase)
 
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 
-/*
- *	FIXME: should not be needed
- */
- 
-static void __init init_setup_via82cxxx (struct pci_dev *dev, ide_pci_device_t *d)
+static int __devinit via_init_one(struct pci_dev *dev, const struct pci_device_id *id)
 {
+	ide_pci_device_t *d = &via82cxxx_chipsets[id->driver_data];
+	if (dev->device != d->device)
+		BUG();
 	ide_setup_pci_device(dev, d);
+	return 0;
 }
 
 /**
- *	via82cxxx_scan_pcidev	-	set up any via devices
- *	@dev: PCI device we are offered
+ *	via_remove_one	-	called with a VIA IDE interface is unplugged
+ *	@dev: the device that was removed
  *
- *	Any controller we are offered that we can configure we set up 
- *	and return 1. Anything we cannot drive we return 0
+ *	Disconnect a VIA IDE device that has been unplugged either by hotplug
+ *	or by a more civilized notification scheme. Not yet supported.
  */
  
-int __init via82cxxx_scan_pcidev (struct pci_dev *dev)
+static void via_remove_one(struct pci_dev *dev)
 {
-	ide_pci_device_t *d;
+	panic("VIA IDE removal not yet supported");
+}
 
-	if (dev->vendor != PCI_VENDOR_ID_VIA)
-		return 0;
+static struct pci_device_id via_pci_tbl[] __devinitdata = {
+	{ PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C576_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
+	{ 0, },
+};
 
-	for (d = via82cxxx_chipsets; d && d->vendor && d->device; ++d) {
-		if (((d->vendor == dev->vendor) &&
-		     (d->device == dev->device)) &&
-		    (d->init_setup)) {
-			d->init_setup(dev, d);
-			return 1;
-		}
-	}
-	return 0;
+static struct pci_driver driver = {
+	name:		"VIA IDE",
+	id_table:	via_pci_tbl,
+	probe:		via_init_one,
+	remove:		__devexit_p(via_remove_one),
+};
+
+static int via_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+static void via_ide_exit(void)
+{
+	ide_pci_unregister_driver(&driver);
 }
 
+module_init(via_ide_init);
+module_exit(via_ide_exit);
+
+MODULE_AUTHOR("Vojtech Pavlik, Michel Aubry, Jeff Garzik, Andre Hedrick");
+MODULE_DESCRIPTION("PCI driver module for VIA IDE");
+MODULE_LICENSE("GPL");
+
+EXPORT_NO_SYMBOLS;
diff --git a/drivers/ide/pci/via82cxxx.h b/drivers/ide/pci/via82cxxx.h
index a0304f4cfccd..16349b338122 100644
--- a/drivers/ide/pci/via82cxxx.h
+++ b/drivers/ide/pci/via82cxxx.h
@@ -25,17 +25,15 @@ static ide_pci_host_proc_t via_procs[] __initdata = {
 };
 #endif /* DISPLAY_VIA_TIMINGS && CONFIG_PROC_FS */
 
-static void init_setup_via82cxxx(struct pci_dev *, ide_pci_device_t *);
 static unsigned int init_chipset_via82cxxx(struct pci_dev *, const char *);
 static void init_hwif_via82cxxx(ide_hwif_t *);
 static void init_dma_via82cxxx(ide_hwif_t *, unsigned long);
 
 static ide_pci_device_t via82cxxx_chipsets[] __initdata = {
-	{
+	{	/* 0 */
 		vendor:		PCI_VENDOR_ID_VIA,
 		device:		PCI_DEVICE_ID_VIA_82C576_1,
 		name:		"VP_IDE",
-		init_setup:	init_setup_via82cxxx,
 		init_chipset:	init_chipset_via82cxxx,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_via82cxxx,
@@ -45,11 +43,10 @@ static ide_pci_device_t via82cxxx_chipsets[] __initdata = {
 		enablebits:	{{0x40,0x02,0x02}, {0x40,0x01,0x01}},
 		bootable:	ON_BOARD,
 		extra:		0,
-	},{
+	},{	/* 1 */
 		vendor:		PCI_VENDOR_ID_VIA,
 		device:		PCI_DEVICE_ID_VIA_82C586_1,
 		name:		"VP_IDE",
-		init_setup:	init_setup_via82cxxx,
 		init_chipset:	init_chipset_via82cxxx,
 		init_iops:	NULL,
 		init_hwif:	init_hwif_via82cxxx,
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index c447682419f6..66a95231b773 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -712,6 +712,8 @@ void ide_setup_pci_device (struct pci_dev *dev, ide_pci_device_t *d)
 		probe_hwif_init(&ide_hwifs[index_list.b.high]);
 }
 
+EXPORT_SYMBOL_GPL(ide_setup_pci_device);
+
 void ide_setup_pci_devices (struct pci_dev *dev, struct pci_dev *dev2, ide_pci_device_t *d)
 {
 	ata_index_t index_list  = do_ide_setup_pci_device(dev, d, 1);
@@ -727,152 +729,10 @@ void ide_setup_pci_devices (struct pci_dev *dev, struct pci_dev *dev2, ide_pci_d
 		probe_hwif_init(&ide_hwifs[index_list2.b.high]);
 }
 
-/*
- * ide_scan_pcibus() gets invoked at boot time from ide.c.
- * It finds all PCI IDE controllers and calls ide_setup_pci_device for them.
- */
-void __init ide_scan_pcidev (struct pci_dev *dev)
-{
-#if 0
-	printk(" PCI slot %s, VID=%04x, DID=%04x\n",
-		dev->slot_name, dev->vendor, dev->device);
-#endif
-
-	if ((dev->vendor == PCI_VENDOR_ID_CMD) &&
-	    (dev->device == PCI_DEVICE_ID_CMD_640)) {
-		return;
-#ifdef CONFIG_BLK_DEV_ALI15X3
-	}{
-		extern int ali15x3_scan_pcidev(struct pci_dev *dev);
-		if (ali15x3_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_ALI15X3 */
-#ifdef CONFIG_BLK_DEV_AMD74XX
-	}{
-		extern int amd74xx_scan_pcidev(struct pci_dev *dev);
-		if (amd74xx_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_AMD74XX */
-#ifdef CONFIG_BLK_DEV_CS5530
-	}{
-		extern int cs5530_scan_pcidev(struct pci_dev *dev);
-		if (cs5530_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_CS5530 */
-#ifdef CONFIG_BLK_DEV_CY82C693
-	}{
-		extern int cy82c693_scan_pcidev(struct pci_dev *dev);
-		if (cy82c693_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_CY82C693 */
-#ifdef CONFIG_BLK_DEV_IT8172
-	}{
-		extern int it8172_scan_pcidev(struct pci_dev *dev);
-		if (it8172_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_IT8172 */
-#ifdef CONFIG_BLK_DEV_NFORCE
-	}{
-		extern int nforce_scan_pcidev(struct pci_dev *dev);
-		if (nforce_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_NFORCE */
-#ifdef CONFIG_BLK_DEV_NS87415
-	}{
-		extern int ns87415_scan_pcidev(struct pci_dev *dev);
-		if (ns87415_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_NS87415 */
-#ifdef CONFIG_BLK_DEV_OPTI621
-	}{
-		extern int opti621_scan_pcidev(struct pci_dev *dev);
-		if (opti621_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_OPTI621 */
-#ifdef CONFIG_BLK_DEV_PIIX
-	}{
-		extern int piix_scan_pcidev(struct pci_dev *dev);
-		if (piix_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_PIIX */
-#ifdef CONFIG_BLK_DEV_RZ1000
-	}{
-		extern int rz1000_scan_pcidev(struct pci_dev *dev);
-		if (rz1000_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_RZ1000 */
-#ifdef CONFIG_BLK_DEV_SVWKS
-	}{
-		extern int serverworks_scan_pcidev(struct pci_dev *dev);
-		if (serverworks_scan_pcidev(dev))	return;
-#endif /* CONFIG_BLK_DEV_SVWKS */
-#ifdef CONFIG_BLK_DEV_SIS5513
-	}{
-		extern int sis5513_scan_pcidev(struct pci_dev *dev);
-		if (sis5513_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_SIS5513 */
-#ifdef CONFIG_BLK_DEV_SLC90E66
-	}{
-		extern int slc90e66_scan_pcidev(struct pci_dev *dev);
-		if (slc90e66_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_SLC90E66 */
-#ifdef CONFIG_BLK_DEV_VIA82CXXX
-	}{
-		extern int via82cxxx_scan_pcidev(struct pci_dev *dev);
-		if (via82cxxx_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_VIA82CXXX */
-
-#ifdef CONFIG_BLK_DEV_AEC62XX
-	}{
-		extern int aec62xx_scan_pcidev(struct pci_dev *dev);
-		if (aec62xx_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_AEC62XX */
-#ifdef CONFIG_BLK_DEV_CMD64X
-	}{
-		extern int cmd64x_scan_pcidev(struct pci_dev *dev);
-		if (cmd64x_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_CMD64X */
-#ifdef CONFIG_BLK_DEV_HPT34X
-	}{
-		extern int hpt34x_scan_pcidev(struct pci_dev *dev);
-		if (hpt34x_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_HPT34X */
-#ifdef CONFIG_BLK_DEV_HPT366
-	}{
-		extern int hpt366_scan_pcidev(struct pci_dev *dev);
-		if (hpt366_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_HPT366 */
-#ifdef CONFIG_BLK_DEV_PDC202XX_OLD
-	}{
-		extern int pdc202xx_scan_pcidev(struct pci_dev *dev);
-		if (pdc202xx_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_PDC202XX_OLD */
-#ifdef CONFIG_BLK_DEV_PDC202XX_NEW
-	}{
-		extern int pdcnew_scan_pcidev(struct pci_dev *dev);
-		if (pdcnew_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_PDC202XX_NEW */
-#ifdef CONFIG_BLK_DEV_PDC_ADMA
-	}{
-		extern int pdcadma_scan_pcidev(struct pci_dev *dev);
-		if (pdcadma_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_PDC_ADMA */
-#ifdef CONFIG_BLK_DEV_SIIMAGE
-	}{
-		extern int siimage_scan_pcidev(struct pci_dev *dev);
-		if (siimage_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_SIIMAGE */
-#ifdef CONFIG_BLK_DEV_SL82C105
-	}{
-		extern int sl82c105_scan_pcidev(struct pci_dev *dev);
-		if (sl82c105_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_SL82C105 */
-#ifdef CONFIG_BLK_DEV_TRM290
-	}{
-		extern int trm290_scan_pcidev(struct pci_dev *dev);
-		if (trm290_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_TRM290 */
-#ifdef CONFIG_BLK_DEV_GENERIC
-	}{
-		extern int generic_scan_pcidev(struct pci_dev *dev);
-		if (generic_scan_pcidev(dev))		return;
-#endif /* CONFIG_BLK_DEV_GENERIC */
-	}
-}
-
+EXPORT_SYMBOL_GPL(ide_setup_pci_devices);
 
 /*
- *	Module interfaces - not yet functional.
+ *	Module interfaces
  */
  
 static int pre_init = 1;		/* Before first ordered IDE scan */
@@ -893,15 +753,15 @@ static LIST_HEAD(ide_pci_drivers);
  *	Returns are the same as for pci_register_driver
  */
 
-int ide_register_pci_driver(struct pci_driver *driver)
+int ide_pci_register_driver(struct pci_driver *driver)
 {
 	if(!pre_init)
-		return pci_register_driver(driver);
+		return pci_module_init(driver);
 	list_add_tail(&driver->node, &ide_pci_drivers);
 	return 0;
 }
 
-EXPORT_SYMBOL(ide_register_pci_driver);
+EXPORT_SYMBOL_GPL(ide_pci_register_driver);
 
 /**
  *	ide_unregister_pci_driver	-	unregister an IDE driver
@@ -911,7 +771,7 @@ EXPORT_SYMBOL(ide_register_pci_driver);
  *	as for pci_unregister_driver
  */
  
-void ide_unregister_pci_driver(struct pci_driver *driver)
+void ide_pci_unregister_driver(struct pci_driver *driver)
 {
 	if(!pre_init)
 		pci_unregister_driver(driver);
@@ -919,7 +779,40 @@ void ide_unregister_pci_driver(struct pci_driver *driver)
 		list_del(&driver->node);
 }
 
-EXPORT_SYMBOL(ide_unregister_pci_driver);
+EXPORT_SYMBOL_GPL(ide_pci_unregister_driver);
+
+/**
+ *	ide_scan_pcidev		-	find an IDE driver for a device
+ *	@dev: PCI device to check
+ *
+ *	Look for an IDE driver to handle the device we are considering.
+ *	This is only used during boot up to get the ordering correct. After
+ *	boot up the pci layer takes over the job.
+ */
+ 
+static int __init ide_scan_pcidev(struct pci_dev *dev)
+{
+	struct list_head *l;
+	struct pci_driver *d;
+	
+	list_for_each(l, &ide_pci_drivers)
+	{
+		d = list_entry(l, struct pci_driver, node);
+		if(d->id_table)
+		{
+			const struct pci_device_id *id = pci_match_device(d->id_table, dev);
+			if(id != NULL)
+			{
+				if(d->probe(dev, id) >= 0)
+				{
+					dev->driver = d;
+					return 1;
+				}
+			}
+		}
+	}
+	return 0;
+}
 
 /**
  *	ide_scan_pcibus		-	perform the initial IDE driver scan
@@ -933,6 +826,8 @@ EXPORT_SYMBOL(ide_unregister_pci_driver);
 void __init ide_scan_pcibus (int scan_direction)
 {
 	struct pci_dev *dev;
+	struct pci_driver *d;
+	struct list_head *l, *n;
 
 	pre_init = 0;
 	if (!scan_direction) {
@@ -944,5 +839,16 @@ void __init ide_scan_pcibus (int scan_direction)
 			ide_scan_pcidev(dev);
 		}
 	}
-	/* FIXME: now add the drivers list to the real pci probe list */
+	
+	/*
+	 *	Hand the drivers over to the PCI layer now we
+	 *	are post init.
+	 */
+
+	list_for_each_safe(l, n, &ide_pci_drivers)
+	{
+		list_del(l);
+		d = list_entry(l, struct pci_driver, node);
+		pci_register_driver(d);
+	}
 }
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c9064eff5996..77bfdef78418 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -904,9 +904,7 @@ extern inline void ide_unmap_buffer(struct request *rq, char *buffer, unsigned l
     ((1<<ide_pci)|(1<<ide_cmd646)|(1<<ide_ali14xx))
 #define IDE_CHIPSET_IS_PCI(c)	((IDE_CHIPSET_PCI_MASK >> (c)) & 1)
 
-#ifdef CONFIG_BLK_DEV_IDEPCI
 struct ide_pci_device_s;
-#endif /* CONFIG_BLK_DEV_IDEPCI */
 
 typedef struct hwif_s {
 	struct hwif_s *next;		/* for linked-list in ide_hwgroup_t */
@@ -937,10 +935,8 @@ typedef struct hwif_s {
 
 	hwif_chipset_t chipset;	/* sub-module for tuning.. */
 
-#ifdef CONFIG_BLK_DEV_IDEPCI
 	struct pci_dev  *pci_dev;	/* for pci chipsets */
 	struct ide_pci_device_s	*cds;	/* chipset device struct */
-#endif /* CONFIG_BLK_DEV_IDEPCI */
 
 #if 0
 	ide_hwif_ops_t	*hwifops;
@@ -1108,12 +1104,10 @@ typedef struct hwgroup_s {
 		/* ptr to current hwif in linked-list */
 	ide_hwif_t *hwif;
 
-#ifdef CONFIG_BLK_DEV_IDEPCI
 		/* for pci chipsets */
 	struct pci_dev *pci_dev;
 		/* chipset device struct */
 	struct ide_pci_device_s *cds;
-#endif /* CONFIG_BLK_DEV_IDEPCI */
 
 		/* current request */
 	struct request *rq;
@@ -1637,23 +1631,16 @@ extern void ide_intr(int irq, void *dev_id, struct pt_regs *regs);
 extern void do_ide_request(request_queue_t *);
 extern void ide_init_subdrivers(void);
 
-#ifndef _IDE_C
 extern struct block_device_operations ide_fops[];
 extern ide_proc_entry_t generic_subdriver_entries[];
-#endif
 
 extern int ata_attach(ide_drive_t *);
 
-#ifdef _IDE_C
-#ifdef CONFIG_BLK_DEV_IDE
 extern int ideprobe_init(void);
 
-#ifdef CONFIG_BLK_DEV_IDEPCI
 extern void ide_scan_pcibus(int scan_direction) __init;
-#endif /* CONFIG_BLK_DEV_IDEPCI */
-
-#endif /* CONFIG_BLK_DEV_IDE */
-#endif /* _IDE_C */
+extern int ide_pci_register_driver(struct pci_driver *driver);
+extern void ide_pci_unregister_driver(struct pci_driver *driver);
 
 extern void default_hwif_iops(ide_hwif_t *);
 extern void default_hwif_mmiops(ide_hwif_t *);
@@ -1665,8 +1652,6 @@ int ide_register_subdriver (ide_drive_t *drive, ide_driver_t *driver, int versio
 int ide_unregister_subdriver (ide_drive_t *drive);
 int ide_replace_subdriver(ide_drive_t *drive, const char *driver);
 
-#ifdef CONFIG_BLK_DEV_IDEPCI
-
 #ifdef CONFIG_PROC_FS
 typedef struct ide_pci_host_proc_s {
 	char				*name;
@@ -1716,14 +1701,9 @@ typedef struct ide_pci_device_s {
 	struct ide_pci_device_s	*next;
 } ide_pci_device_t;
 
-#ifdef LINUX_PCI_H
 extern void ide_setup_pci_device(struct pci_dev *, ide_pci_device_t *);
 extern void ide_setup_pci_devices(struct pci_dev *, struct pci_dev *, ide_pci_device_t *);
-#endif /* LINUX_PCI_H */
-
-#endif /* CONFIG_BLK_DEV_IDEPCI */
 
-#ifdef CONFIG_BLK_DEV_IDEDMA
 #define BAD_DMA_DRIVE		0
 #define GOOD_DMA_DRIVE		1
 extern int ide_build_dmatable(ide_drive_t *, struct request *);
@@ -1750,7 +1730,6 @@ extern int __ide_dma_verbose(ide_drive_t *);
 extern int __ide_dma_retune(ide_drive_t *);
 extern int __ide_dma_lostirq(ide_drive_t *);
 extern int __ide_dma_timeout(ide_drive_t *);
-#endif /* CONFIG_BLK_DEV_IDEDMA */
 
 extern void hwif_unregister(ide_hwif_t *);
 
-- 
cgit v1.2.3


From e80bc9593fe8de4aeec2ffe0e24737a0dce22822 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@burns.home.kernel.dk>
Date: Mon, 16 Sep 2002 19:50:54 +0200
Subject: ide.h needs to include pci.h

---
 include/linux/ide.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ide.h b/include/linux/ide.h
index 77bfdef78418..6ce0a520cc70 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -17,6 +17,7 @@
 #include <linux/interrupt.h>
 #include <linux/bitops.h>
 #include <linux/bio.h>
+#include <linux/pci.h>
 #include <asm/byteorder.h>
 #include <asm/system.h>
 #include <asm/hdreg.h>
-- 
cgit v1.2.3


From 88c4297444a93e4ff6161eb3e529b2b085657285 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Sun, 15 Sep 2002 18:23:22 -0700
Subject: [PATCH] fix elevator_linus accounting

elevator_linus is seriously broken wrt accounting. Marcelo recently took
the patch to fix it in 2.4.20-pre, here's the 2.5 equiv.

Right now, we account merges as costly and seeks as not. Only thing that
prevents seek starvation is the aging scan. That is broken, very much
so. This patch fixes that to account merges and inserts differently. A
seek is ELV_LINUS_SEEK_COST more costly than a merge, currently that
define is at '16'. Doing the math on a disk, this sort of makes sense.

Defaults are read latency of 1024, which means 1024 merges or 64 seeks.
Writes are double that.
---
 drivers/block/elevator.c  | 47 ++++++++++++++---------------------------------
 drivers/block/ll_rw_blk.c |  2 --
 include/linux/elevator.h  |  5 +----
 3 files changed, 15 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index d5290e3b3cc7..15bfca03f35b 100644
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@@ -177,14 +177,9 @@ int elevator_linus_merge(request_queue_t *q, struct request **req,
 
 		if (__rq->flags & (REQ_BARRIER | REQ_STARTED))
 			break;
-
-		/*
-		 * simply "aging" of requests in queue
-		 */
-		if (elv_linus_sequence(__rq)-- <= 0)
-			break;
 		if (!(__rq->flags & REQ_CMD))
 			continue;
+
 		if (elv_linus_sequence(__rq) < bio_sectors(bio))
 			break;
 
@@ -200,24 +195,20 @@ int elevator_linus_merge(request_queue_t *q, struct request **req,
 		}
 	}
 
-	return ret;
-}
-
-void elevator_linus_merge_cleanup(request_queue_t *q, struct request *req, int count)
-{
-	struct list_head *entry;
-
-	BUG_ON(req->q != q);
-
 	/*
-	 * second pass scan of requests that got passed over, if any
+	 * if *req, it's either a seek or merge in the middle of the queue
 	 */
-	entry = &req->queuelist;
-	while ((entry = entry->next) != &q->queue_head) {
-		struct request *tmp;
-		tmp = list_entry_rq(entry);
-		elv_linus_sequence(tmp) -= count;
+	if (*req) {
+		struct list_head *entry = &(*req)->queuelist;
+		int cost = ret ? 1 : ELV_LINUS_SEEK_COST;
+
+		while ((entry = entry->next) != &q->queue_head) {
+			__rq = list_entry_rq(entry);
+			elv_linus_sequence(__rq) -= cost;
+		}
 	}
+
+	return ret;
 }
 
 void elevator_linus_merge_req(request_queue_t *q, struct request *req,
@@ -260,8 +251,8 @@ int elevator_linus_init(request_queue_t *q, elevator_t *e)
 	if (!latency)
 		return -ENOMEM;
 
-	latency[READ] = 8192;
-	latency[WRITE] = 16384;
+	latency[READ] = 1024;
+	latency[WRITE] = 2048;
 
 	e->elevator_data = latency;
 	return 0;
@@ -355,15 +346,6 @@ int elevator_global_init(void)
 	return 0;
 }
 
-void elv_merge_cleanup(request_queue_t *q, struct request *rq,
-		       int nr_sectors)
-{
-	elevator_t *e = &q->elevator;
-
-	if (e->elevator_merge_cleanup_fn)
-		e->elevator_merge_cleanup_fn(q, rq, nr_sectors);
-}
-
 int elv_merge(request_queue_t *q, struct request **rq, struct bio *bio)
 {
 	elevator_t *e = &q->elevator;
@@ -460,7 +442,6 @@ inline struct list_head *elv_get_sort_head(request_queue_t *q,
 
 elevator_t elevator_linus = {
 	elevator_merge_fn:		elevator_linus_merge,
-	elevator_merge_cleanup_fn:	elevator_linus_merge_cleanup,
 	elevator_merge_req_fn:		elevator_linus_merge_req,
 	elevator_next_req_fn:		elevator_noop_next_request,
 	elevator_add_req_fn:		elevator_linus_add_request,
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index b7765eb5a550..e95f7f0808f9 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -1508,7 +1508,6 @@ again:
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
-			elv_merge_cleanup(q, req, nr_sectors);
 			drive_stat_acct(req, nr_sectors, 0);
 			attempt_back_merge(q, req);
 			goto out;
@@ -1532,7 +1531,6 @@ again:
 			req->hard_cur_sectors = cur_nr_sectors;
 			req->sector = req->hard_sector = sector;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
-			elv_merge_cleanup(q, req, nr_sectors);
 			drive_stat_acct(req, nr_sectors, 0);
 			attempt_front_merge(q, req);
 			goto out;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 5730a5bd5a78..e98168f92e67 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -4,8 +4,6 @@
 typedef int (elevator_merge_fn) (request_queue_t *, struct request **,
 				 struct bio *);
 
-typedef void (elevator_merge_cleanup_fn) (request_queue_t *, struct request *, int);
-
 typedef void (elevator_merge_req_fn) (request_queue_t *, struct request *, struct request *);
 
 typedef struct request *(elevator_next_req_fn) (request_queue_t *);
@@ -21,7 +19,6 @@ typedef void (elevator_exit_fn) (request_queue_t *, elevator_t *);
 struct elevator_s
 {
 	elevator_merge_fn *elevator_merge_fn;
-	elevator_merge_cleanup_fn *elevator_merge_cleanup_fn;
 	elevator_merge_req_fn *elevator_merge_req_fn;
 
 	elevator_next_req_fn *elevator_next_req_fn;
@@ -42,7 +39,6 @@ struct elevator_s
  */
 extern void __elv_add_request(request_queue_t *, struct request *,
 			      struct list_head *);
-extern void elv_merge_cleanup(request_queue_t *, struct request *, int);
 extern int elv_merge(request_queue_t *, struct request **, struct bio *);
 extern void elv_merge_requests(request_queue_t *, struct request *,
 			       struct request *);
@@ -61,6 +57,7 @@ extern elevator_t elevator_noop;
  */
 extern elevator_t elevator_linus;
 #define elv_linus_sequence(rq)	((long)(rq)->elevator_private)
+#define ELV_LINUS_SEEK_COST	16
 
 /*
  * use the /proc/iosched interface, all the below is history ->
-- 
cgit v1.2.3