Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig21
-rw-r--r--fs/afs/main.c6
-rw-r--r--fs/autofs4/root.c24
-rw-r--r--fs/block_dev.c6
-rw-r--r--fs/buffer.c100
-rw-r--r--fs/cifs/CHANGES4
-rw-r--r--fs/cifs/cifsfs.c2
-rw-r--r--fs/compat_ioctl.c1
-rw-r--r--fs/dcache.c50
-rw-r--r--fs/devfs/base.c1
-rw-r--r--fs/devpts/Makefile1
-rw-r--r--fs/devpts/inode.c34
-rw-r--r--fs/devpts/xattr.c214
-rw-r--r--fs/devpts/xattr.h59
-rw-r--r--fs/devpts/xattr_security.c29
-rw-r--r--fs/exec.c14
-rw-r--r--fs/ext2/acl.c111
-rw-r--r--fs/ext2/acl.h3
-rw-r--r--fs/ext2/file.c8
-rw-r--r--fs/ext2/namei.c16
-rw-r--r--fs/ext2/super.c1
-rw-r--r--fs/ext2/symlink.c16
-rw-r--r--fs/ext2/xattr.c269
-rw-r--r--fs/ext2/xattr.h32
-rw-r--r--fs/ext2/xattr_security.c11
-rw-r--r--fs/ext2/xattr_trusted.c11
-rw-r--r--fs/ext2/xattr_user.c27
-rw-r--r--fs/ext3/acl.c111
-rw-r--r--fs/ext3/file.c8
-rw-r--r--fs/ext3/inode.c11
-rw-r--r--fs/ext3/namei.c16
-rw-r--r--fs/ext3/super.c1
-rw-r--r--fs/ext3/symlink.c16
-rw-r--r--fs/ext3/xattr.c266
-rw-r--r--fs/ext3/xattr.h32
-rw-r--r--fs/ext3/xattr_security.c14
-rw-r--r--fs/ext3/xattr_trusted.c13
-rw-r--r--fs/ext3/xattr_user.c13
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/fs-writeback.c20
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/inode.c69
-rw-r--r--fs/jbd/commit.c2
-rw-r--r--fs/jbd/transaction.c9
-rw-r--r--fs/jfs/acl.c81
-rw-r--r--fs/jfs/super.c4
-rw-r--r--fs/mbcache.c33
-rw-r--r--fs/namei.c34
-rw-r--r--fs/namespace.c4
-rw-r--r--fs/nfs/dir.c2
-rw-r--r--fs/ntfs/ChangeLog124
-rw-r--r--fs/ntfs/Makefile6
-rw-r--r--fs/ntfs/aops.c407
-rw-r--r--fs/ntfs/aops.h102
-rw-r--r--fs/ntfs/attrib.c1173
-rw-r--r--fs/ntfs/attrib.h26
-rw-r--r--fs/ntfs/bitmap.c1
-rw-r--r--fs/ntfs/collate.c3
-rw-r--r--fs/ntfs/compress.c9
-rw-r--r--fs/ntfs/debug.h6
-rw-r--r--fs/ntfs/dir.c14
-rw-r--r--fs/ntfs/dir.h2
-rw-r--r--fs/ntfs/file.c5
-rw-r--r--fs/ntfs/index.c59
-rw-r--r--fs/ntfs/index.h6
-rw-r--r--fs/ntfs/inode.c144
-rw-r--r--fs/ntfs/inode.h19
-rw-r--r--fs/ntfs/layout.h83
-rw-r--r--fs/ntfs/lcnalloc.c9
-rw-r--r--fs/ntfs/lcnalloc.h29
-rw-r--r--fs/ntfs/logfile.c65
-rw-r--r--fs/ntfs/malloc.h1
-rw-r--r--fs/ntfs/mft.c2445
-rw-r--r--fs/ntfs/mft.h15
-rw-r--r--fs/ntfs/namei.c5
-rw-r--r--fs/ntfs/ntfs.h78
-rw-r--r--fs/ntfs/quota.c3
-rw-r--r--fs/ntfs/runlist.c1462
-rw-r--r--fs/ntfs/runlist.h89
-rw-r--r--fs/ntfs/super.c26
-rw-r--r--fs/ntfs/types.h28
-rw-r--r--fs/ntfs/unistr.c2
-rw-r--r--fs/ntfs/upcase.c1
-rw-r--r--fs/ntfs/volume.h4
-rw-r--r--fs/posix_acl.c42
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/root.c3
-rw-r--r--fs/proc/task_mmu.c11
-rw-r--r--fs/reiserfs/bitmap.c4
-rw-r--r--fs/reiserfs/dir.c4
-rw-r--r--fs/reiserfs/fix_node.c4
-rw-r--r--fs/reiserfs/ibalance.c8
-rw-r--r--fs/reiserfs/item_ops.c10
-rw-r--r--fs/reiserfs/namei.c2
-rw-r--r--fs/reiserfs/prints.c10
-rw-r--r--fs/reiserfs/stree.c28
-rw-r--r--fs/reiserfs/super.c8
-rw-r--r--fs/select.c2
-rw-r--r--fs/smbfs/inode.c23
-rw-r--r--fs/smbfs/proc.c30
-rw-r--r--fs/smbfs/proto.h2
-rw-r--r--fs/xattr.c130
105 files changed, 5283 insertions, 3335 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index c8cf86596948..0c605757d63d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -919,6 +919,27 @@ config TMPFS
See <file:Documentation/filesystems/tmpfs.txt> for details.
+config TMPFS_XATTR
+ bool "tmpfs Extended Attributes"
+ depends on TMPFS
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+config TMPFS_SECURITY
+ bool "tmpfs Security Labels"
+ depends on TMPFS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the tmpfs filesystem.
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
+
config HUGETLBFS
bool "HugeTLB file system support"
depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN
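
For context, the TMPFS_XATTR help text above describes extended attributes as name:value pairs attached to inodes. A minimal userspace sketch of that model, assuming a hypothetical file /tmp/example on a tmpfs mount (the path and value are illustrative, the setxattr(2)/getxattr(2) calls are the real syscall interface):

/*
 * Minimal sketch of the name:value model from the help text above.
 * "/tmp/example" and the attribute value are illustrative assumptions.
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>	/* <attr/xattr.h> on older libattr setups */

int main(void)
{
	char buf[64];
	ssize_t len;

	/* Attach a user-namespace attribute to a file. */
	if (setxattr("/tmp/example", "user.comment", "hello", 5, 0) < 0) {
		perror("setxattr");
		return 1;
	}

	/* Read it back; getxattr() returns the value's length. */
	len = getxattr("/tmp/example", "user.comment", buf, sizeof(buf));
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	printf("user.comment = %.*s\n", (int)len, buf);
	return 0;
}
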
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 955dbef62b69..c8775699fb44 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -100,7 +100,7 @@ static int afs_init(void)
goto error;
#endif
-#ifdef CONFIG_KEYS
+#ifdef CONFIG_KEYS_TURNED_OFF
ret = afs_key_register();
if (ret < 0)
goto error_cache;
@@ -142,7 +142,7 @@ static int afs_init(void)
error_kafstimod:
afs_kafstimod_stop();
error_keys:
-#ifdef CONFIG_KEYS
+#ifdef CONFIG_KEYS_TURNED_OFF
afs_key_unregister();
error_cache:
#endif
@@ -169,7 +169,7 @@ static void __exit afs_exit(void)
afs_kafstimod_stop();
afs_kafsasyncd_stop();
afs_cell_purge();
-#ifdef CONFIG_KEYS
+#ifdef CONFIG_KEYS_TURNED_OFF
afs_key_unregister();
#endif
#ifdef AFS_CACHING_SUPPORT
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 1ff9adb6a8da..82ef8ed2fabc 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -19,7 +19,6 @@
#include <linux/smp_lock.h>
#include "autofs_i.h"
-static struct dentry *autofs4_dir_lookup(struct inode *,struct dentry *, struct nameidata *);
static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
static int autofs4_dir_unlink(struct inode *,struct dentry *);
static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -29,7 +28,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file);
static int autofs4_dir_close(struct inode *inode, struct file *file);
static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir);
static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir);
-static struct dentry *autofs4_root_lookup(struct inode *,struct dentry *, struct nameidata *);
+static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
static int autofs4_dcache_readdir(struct file *, void *, filldir_t);
struct file_operations autofs4_root_operations = {
@@ -48,7 +47,7 @@ struct file_operations autofs4_dir_operations = {
};
struct inode_operations autofs4_root_inode_operations = {
- .lookup = autofs4_root_lookup,
+ .lookup = autofs4_lookup,
.unlink = autofs4_dir_unlink,
.symlink = autofs4_dir_symlink,
.mkdir = autofs4_dir_mkdir,
@@ -56,7 +55,7 @@ struct inode_operations autofs4_root_inode_operations = {
};
struct inode_operations autofs4_dir_inode_operations = {
- .lookup = autofs4_dir_lookup,
+ .lookup = autofs4_lookup,
.unlink = autofs4_dir_unlink,
.symlink = autofs4_dir_symlink,
.mkdir = autofs4_dir_mkdir,
@@ -439,23 +438,8 @@ static struct dentry_operations autofs4_dentry_operations = {
.d_release = autofs4_dentry_release,
};
-/* Lookups in non-root dirs never find anything - if it's there, it's
- already in the dcache */
-static struct dentry *autofs4_dir_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-#if 0
- DPRINTK("ignoring lookup of %.*s/%.*s",
- dentry->d_parent->d_name.len, dentry->d_parent->d_name.name,
- dentry->d_name.len, dentry->d_name.name);
-#endif
-
- dentry->d_fsdata = NULL;
- d_add(dentry, NULL);
- return NULL;
-}
-
/* Lookups in the root directory */
-static struct dentry *autofs4_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
struct autofs_sb_info *sbi;
int oz_mode;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 5c3f09b86172..28aac88114de 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -666,7 +666,7 @@ int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags)
EXPORT_SYMBOL(blkdev_get);
-int blkdev_open(struct inode * inode, struct file * filp)
+static int blkdev_open(struct inode * inode, struct file * filp)
{
struct block_device *bdev;
int res;
@@ -695,8 +695,6 @@ int blkdev_open(struct inode * inode, struct file * filp)
return res;
}
-EXPORT_SYMBOL(blkdev_open);
-
int blkdev_put(struct block_device *bdev)
{
int ret = 0;
@@ -798,8 +796,6 @@ struct file_operations def_blk_fops = {
.sendfile = generic_file_sendfile,
};
-EXPORT_SYMBOL(def_blk_fops);
-
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
{
int res;
diff --git a/fs/buffer.c b/fs/buffer.c
index 4ec2acb57946..2a75b3f9efe4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -45,34 +45,6 @@ static void invalidate_bh_lrus(void);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
-struct bh_wait_queue {
- struct buffer_head *bh;
- wait_queue_t wait;
-};
-
-#define __DEFINE_BH_WAIT(name, b, f) \
- struct bh_wait_queue name = { \
- .bh = b, \
- .wait = { \
- .task = current, \
- .flags = f, \
- .func = bh_wake_function, \
- .task_list = \
- LIST_HEAD_INIT(name.wait.task_list),\
- }, \
- }
-#define DEFINE_BH_WAIT(name, bh) __DEFINE_BH_WAIT(name, bh, 0)
-#define DEFINE_BH_WAIT_EXCLUSIVE(name, bh) \
- __DEFINE_BH_WAIT(name, bh, WQ_FLAG_EXCLUSIVE)
-
-/*
- * Hashed waitqueue_head's for wait_on_buffer()
- */
-#define BH_WAIT_TABLE_ORDER 7
-static struct bh_wait_queue_head {
- wait_queue_head_t wqh;
-} ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
-
inline void
init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
{
@@ -80,71 +52,32 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
bh->b_private = private;
}
-/*
- * Return the address of the waitqueue_head to be used for this
- * buffer_head
- */
-wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
-{
- return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
-}
-EXPORT_SYMBOL(bh_waitq_head);
-
-void wake_up_buffer(struct buffer_head *bh)
-{
- wait_queue_head_t *wq = bh_waitq_head(bh);
-
- smp_mb();
- if (waitqueue_active(wq))
- __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, bh);
-}
-EXPORT_SYMBOL(wake_up_buffer);
-
-static int bh_wake_function(wait_queue_t *wait, unsigned mode,
- int sync, void *key)
-{
- struct buffer_head *bh = key;
- struct bh_wait_queue *wq;
-
- wq = container_of(wait, struct bh_wait_queue, wait);
- if (wq->bh != bh || buffer_locked(bh))
- return 0;
- else
- return autoremove_wake_function(wait, mode, sync, key);
-}
-
-static void sync_buffer(struct buffer_head *bh)
+static int sync_buffer(void *word)
{
struct block_device *bd;
+ struct buffer_head *bh
+ = container_of(word, struct buffer_head, b_state);
smp_mb();
bd = bh->b_bdev;
if (bd)
blk_run_address_space(bd->bd_inode->i_mapping);
+ io_schedule();
+ return 0;
}
void fastcall __lock_buffer(struct buffer_head *bh)
{
- wait_queue_head_t *wqh = bh_waitq_head(bh);
- DEFINE_BH_WAIT_EXCLUSIVE(wait, bh);
-
- do {
- prepare_to_wait_exclusive(wqh, &wait.wait,
- TASK_UNINTERRUPTIBLE);
- if (buffer_locked(bh)) {
- sync_buffer(bh);
- io_schedule();
- }
- } while (test_set_buffer_locked(bh));
- finish_wait(wqh, &wait.wait);
+ wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
+ TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);
void fastcall unlock_buffer(struct buffer_head *bh)
{
clear_buffer_locked(bh);
- smp_mb__after_clear_bit();
- wake_up_buffer(bh);
+ smp_mb();
+ wake_up_bit(&bh->b_state, BH_Lock);
}
/*
@@ -154,17 +87,7 @@ void fastcall unlock_buffer(struct buffer_head *bh)
*/
void __wait_on_buffer(struct buffer_head * bh)
{
- wait_queue_head_t *wqh = bh_waitq_head(bh);
- DEFINE_BH_WAIT(wait, bh);
-
- do {
- prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
- if (buffer_locked(bh)) {
- sync_buffer(bh);
- io_schedule();
- }
- } while (buffer_locked(bh));
- finish_wait(wqh, &wait.wait);
+ wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
}
static void
@@ -3123,14 +3046,11 @@ static int buffer_cpu_notify(struct notifier_block *self,
void __init buffer_init(void)
{
- int i;
int nrpages;
bh_cachep = kmem_cache_create("buffer_head",
sizeof(struct buffer_head), 0,
SLAB_PANIC, init_buffer_head, NULL);
- for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++)
- init_waitqueue_head(&bh_wait_queue_heads[i].wqh);
/*
* Limit the bh occupancy to 10% of ZONE_NORMAL
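
The buffer.c hunks above drop the per-buffer hashed waitqueues in favour of the generic bit-waitqueue API: lockers sleep in wait_on_bit_lock() on the BH_Lock bit inside b_state, and unlock_buffer() wakes them with wake_up_bit(). A condensed sketch of the pattern, where "struct my_obj", my_lock() and MY_BUSY are illustrative assumptions but the API calls are the ones the patch uses:

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>

#define MY_BUSY	0			/* bit number inside ->flags */

struct my_obj {
	unsigned long flags;
};

/*
 * Action callback: invoked while the bit is still set; return 0 to
 * re-test the bit after waking, nonzero to abort the wait.
 */
static int my_wait_action(void *word)
{
	io_schedule();			/* sleep until wake_up_bit() */
	return 0;
}

static void my_lock(struct my_obj *obj)
{
	wait_on_bit_lock(&obj->flags, MY_BUSY, my_wait_action,
			 TASK_UNINTERRUPTIBLE);
}

static void my_unlock(struct my_obj *obj)
{
	clear_bit(MY_BUSY, &obj->flags);
	smp_mb();			/* order the clear before the wake */
	wake_up_bit(&obj->flags, MY_BUSY);
}
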
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 087be36211eb..2a1924078059 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -7,8 +7,8 @@ or out of order NULL pointer checks in little used error paths).
Version 1.21
------------
-Add new mount parm to control whether mode check (vfs_permission) is done on
-the client. If Unix extensions are enabled and the uids on the client
+Add new mount parm to control whether mode check (generic_permission) is done
+on the client. If Unix extensions are enabled and the uids on the client
and server do not match, client permission checks are meaningless on
server uids that do not exist on the client (this does not affect the
normal ACL check which occurs on the server). Fix default uid
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 729cdba71530..996fc298a306 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -200,7 +200,7 @@ static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd)
on the client (above and beyond ACL on servers) for
servers which do not support setting and viewing mode bits,
so allowing client to check permissions is useful */
- return vfs_permission(inode, mask);
+ return generic_permission(inode, mask, NULL);
}
static kmem_cache_t *cifs_inode_cachep;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 2edf63444ccb..5642ccc235e8 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -115,6 +115,7 @@
#include <linux/random.h>
#include <linux/filter.h>
#include <linux/msdos_fs.h>
+#include <linux/pktcdvd.h>
#include <linux/hiddev.h>
diff --git a/fs/dcache.c b/fs/dcache.c
index ec5668a4136e..9918e4fab786 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -722,7 +722,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
dentry->d_fsdata = NULL;
dentry->d_mounted = 0;
dentry->d_cookie = NULL;
- dentry->d_bucket = NULL;
INIT_HLIST_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -851,12 +850,6 @@ struct dentry * d_alloc_anon(struct inode *inode)
res->d_sb = inode->i_sb;
res->d_parent = res;
res->d_inode = inode;
-
- /*
- * Set d_bucket to an "impossible" bucket address so
- * that d_move() doesn't get a false positive
- */
- res->d_bucket = NULL;
res->d_flags |= DCACHE_DISCONNECTED;
res->d_flags &= ~DCACHE_UNHASHED;
list_add(&res->d_alias, &inode->i_dentry);
@@ -979,8 +972,6 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
dentry = hlist_entry(node, struct dentry, d_hash);
- smp_rmb();
-
if (dentry->d_name.hash != hash)
continue;
if (dentry->d_parent != parent)
@@ -989,13 +980,6 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
spin_lock(&dentry->d_lock);
/*
- * If lookup ends up in a different bucket due to concurrent
- * rename, fail it
- */
- if (unlikely(dentry->d_bucket != head))
- goto terminate;
-
- /*
* Recheck the dentry after taking the lock - d_move may have
* changed things. Don't bother checking the hash because we're
* about to compare the whole name anyway.
@@ -1003,7 +987,11 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
if (dentry->d_parent != parent)
goto next;
- qstr = rcu_dereference(&dentry->d_name);
+ /*
+ * It is safe to compare names since d_move() cannot
+ * change the qstr (protected by d_lock).
+ */
+ qstr = &dentry->d_name;
if (parent->d_op && parent->d_op->d_compare) {
if (parent->d_op->d_compare(parent, qstr, name))
goto next;
@@ -1018,7 +1006,6 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
atomic_inc(&dentry->d_count);
found = dentry;
}
-terminate:
spin_unlock(&dentry->d_lock);
break;
next:
@@ -1110,6 +1097,13 @@ void d_delete(struct dentry * dentry)
spin_unlock(&dcache_lock);
}
+static void __d_rehash(struct dentry * entry, struct hlist_head *list)
+{
+
+ entry->d_flags &= ~DCACHE_UNHASHED;
+ hlist_add_head_rcu(&entry->d_hash, list);
+}
+
/**
* d_rehash - add an entry back to the hash
* @entry: dentry to add to the hash
@@ -1123,10 +1117,8 @@ void d_rehash(struct dentry * entry)
spin_lock(&dcache_lock);
spin_lock(&entry->d_lock);
- entry->d_flags &= ~DCACHE_UNHASHED;
+ __d_rehash(entry, list);
spin_unlock(&entry->d_lock);
- entry->d_bucket = list;
- hlist_add_head_rcu(&entry->d_hash, list);
spin_unlock(&dcache_lock);
}
@@ -1204,6 +1196,8 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
void d_move(struct dentry * dentry, struct dentry * target)
{
+ struct hlist_head *list;
+
if (!dentry->d_inode)
printk(KERN_WARNING "VFS: moving negative dcache entry\n");
@@ -1223,13 +1217,12 @@ void d_move(struct dentry * dentry, struct dentry * target)
/* Move the dentry to the target hash queue, if on different bucket */
if (dentry->d_flags & DCACHE_UNHASHED)
goto already_unhashed;
- if (dentry->d_bucket != target->d_bucket) {
- hlist_del_rcu(&dentry->d_hash);
+
+ hlist_del_rcu(&dentry->d_hash);
+
already_unhashed:
- dentry->d_bucket = target->d_bucket;
- hlist_add_head_rcu(&dentry->d_hash, target->d_bucket);
- dentry->d_flags &= ~DCACHE_UNHASHED;
- }
+ list = d_hash(target->d_parent, target->d_name.hash);
+ __d_rehash(dentry, list);
/* Unhash the target: dput() will then get rid of it */
__d_drop(target);
@@ -1239,7 +1232,6 @@ already_unhashed:
/* Switch the names.. */
switch_names(dentry, target);
- smp_wmb();
do_switch(dentry->d_name.len, target->d_name.len);
do_switch(dentry->d_name.hash, target->d_name.hash);
@@ -1656,8 +1648,6 @@ EXPORT_SYMBOL(dget_locked);
EXPORT_SYMBOL(dput);
EXPORT_SYMBOL(find_inode_number);
EXPORT_SYMBOL(have_submounts);
-EXPORT_SYMBOL(is_subdir);
EXPORT_SYMBOL(names_cachep);
-EXPORT_SYMBOL(shrink_dcache_anon);
EXPORT_SYMBOL(shrink_dcache_parent);
EXPORT_SYMBOL(shrink_dcache_sb);
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index a62d9412e73e..65e299a2d97e 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -1802,7 +1802,6 @@ static int __init devfs_setup(char *str)
__setup("devfs=", devfs_setup);
-EXPORT_SYMBOL(devfs_mk_symlink);
EXPORT_SYMBOL(devfs_mk_dir);
EXPORT_SYMBOL(devfs_remove);
diff --git a/fs/devpts/Makefile b/fs/devpts/Makefile
index 5eb7a942a0b7..5800df2e50c8 100644
--- a/fs/devpts/Makefile
+++ b/fs/devpts/Makefile
@@ -5,5 +5,4 @@
obj-$(CONFIG_UNIX98_PTYS) += devpts.o
devpts-$(CONFIG_UNIX98_PTYS) := inode.o
-devpts-$(CONFIG_DEVPTS_FS_XATTR) += xattr.o
devpts-$(CONFIG_DEVPTS_FS_SECURITY) += xattr_security.o
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 55ac11f70a04..49e8641609d5 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -18,10 +18,28 @@
#include <linux/mount.h>
#include <linux/tty.h>
#include <linux/devpts_fs.h>
-#include "xattr.h"
+#include <linux/xattr.h>
#define DEVPTS_SUPER_MAGIC 0x1cd1
+extern struct xattr_handler devpts_xattr_security_handler;
+
+static struct xattr_handler *devpts_xattr_handlers[] = {
+#ifdef CONFIG_DEVPTS_FS_SECURITY
+ &devpts_xattr_security_handler,
+#endif
+ NULL
+};
+
+struct inode_operations devpts_file_inode_operations = {
+#ifdef CONFIG_DEVPTS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = generic_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
static struct vfsmount *devpts_mnt;
static struct dentry *devpts_root;
@@ -84,6 +102,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
s->s_op = &devpts_sops;
+ s->s_xattr = devpts_xattr_handlers;
inode = new_inode(s);
if (!inode)
@@ -134,13 +153,6 @@ static struct dentry *get_node(int num)
return lookup_one_len(s, root, sprintf(s, "%d", num));
}
-static struct inode_operations devpts_file_inode_operations = {
- .setxattr = devpts_setxattr,
- .getxattr = devpts_getxattr,
- .listxattr = devpts_listxattr,
- .removexattr = devpts_removexattr,
-};
-
int devpts_pty_new(struct tty_struct *tty)
{
int number = tty->index;
@@ -209,10 +221,7 @@ void devpts_pty_kill(int number)
static int __init init_devpts_fs(void)
{
- int err = init_devpts_xattr();
- if (err)
- return err;
- err = register_filesystem(&devpts_fs_type);
+ int err = register_filesystem(&devpts_fs_type);
if (!err) {
devpts_mnt = kern_mount(&devpts_fs_type);
if (IS_ERR(devpts_mnt))
@@ -225,7 +234,6 @@ static void __exit exit_devpts_fs(void)
{
unregister_filesystem(&devpts_fs_type);
mntput(devpts_mnt);
- exit_devpts_xattr();
}
module_init(init_devpts_fs)
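
The devpts changes above replace the filesystem-private handler registry with the consolidated VFS mechanism: a NULL-terminated struct xattr_handler table hung off sb->s_xattr, with generic_setxattr() and friends dispatching on each handler's .prefix. A sketch for a hypothetical filesystem (every "myfs" identifier is an illustrative assumption; the get/set bodies are stubs):

#include <linux/fs.h>
#include <linux/xattr.h>
#include <linux/string.h>

static size_t myfs_xattr_list(struct inode *inode, char *list,
			      size_t list_size, const char *name,
			      size_t name_len)
{
	const size_t len = sizeof("myfs.example");

	if (list && len <= list_size)	/* copy only when it fits */
		memcpy(list, "myfs.example", len);
	return len;			/* always report the size needed */
}

static int myfs_xattr_get(struct inode *inode, const char *name,
			  void *buffer, size_t size)
{
	return -EOPNOTSUPP;		/* stub for the sketch */
}

static int myfs_xattr_set(struct inode *inode, const char *name,
			  const void *value, size_t size, int flags)
{
	return -EOPNOTSUPP;		/* stub for the sketch */
}

static struct xattr_handler myfs_xattr_handler = {
	.prefix	= "myfs.",
	.list	= myfs_xattr_list,
	.get	= myfs_xattr_get,
	.set	= myfs_xattr_set,
};

static struct xattr_handler *myfs_xattr_handlers[] = {
	&myfs_xattr_handler,
	NULL				/* table is NULL-terminated */
};

/* In myfs_fill_super():  sb->s_xattr = myfs_xattr_handlers;  and the
 * inode_operations point at generic_setxattr, generic_getxattr,
 * generic_listxattr and generic_removexattr, as devpts does above. */
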
diff --git a/fs/devpts/xattr.c b/fs/devpts/xattr.c
deleted file mode 100644
index db7e15c4fc30..000000000000
--- a/fs/devpts/xattr.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- File: fs/devpts/xattr.c
-
- Derived from fs/ext3/xattr.c, changed in the following ways:
- drop everything related to persistent storage of EAs
- pass dentry rather than inode to internal methods
- only presently define a handler for security modules
-*/
-
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <asm/semaphore.h>
-#include "xattr.h"
-
-static struct devpts_xattr_handler *devpts_xattr_handlers[DEVPTS_XATTR_INDEX_MAX];
-static rwlock_t devpts_handler_lock = RW_LOCK_UNLOCKED;
-
-int
-devpts_xattr_register(int name_index, struct devpts_xattr_handler *handler)
-{
- int error = -EINVAL;
-
- if (name_index > 0 && name_index <= DEVPTS_XATTR_INDEX_MAX) {
- write_lock(&devpts_handler_lock);
- if (!devpts_xattr_handlers[name_index-1]) {
- devpts_xattr_handlers[name_index-1] = handler;
- error = 0;
- }
- write_unlock(&devpts_handler_lock);
- }
- return error;
-}
-
-void
-devpts_xattr_unregister(int name_index, struct devpts_xattr_handler *handler)
-{
- if (name_index > 0 || name_index <= DEVPTS_XATTR_INDEX_MAX) {
- write_lock(&devpts_handler_lock);
- devpts_xattr_handlers[name_index-1] = NULL;
- write_unlock(&devpts_handler_lock);
- }
-}
-
-static inline const char *
-strcmp_prefix(const char *a, const char *a_prefix)
-{
- while (*a_prefix && *a == *a_prefix) {
- a++;
- a_prefix++;
- }
- return *a_prefix ? NULL : a;
-}
-
-/*
- * Decode the extended attribute name, and translate it into
- * the name_index and name suffix.
- */
-static inline struct devpts_xattr_handler *
-devpts_xattr_resolve_name(const char **name)
-{
- struct devpts_xattr_handler *handler = NULL;
- int i;
-
- if (!*name)
- return NULL;
- read_lock(&devpts_handler_lock);
- for (i=0; i<DEVPTS_XATTR_INDEX_MAX; i++) {
- if (devpts_xattr_handlers[i]) {
- const char *n = strcmp_prefix(*name,
- devpts_xattr_handlers[i]->prefix);
- if (n) {
- handler = devpts_xattr_handlers[i];
- *name = n;
- break;
- }
- }
- }
- read_unlock(&devpts_handler_lock);
- return handler;
-}
-
-static inline struct devpts_xattr_handler *
-devpts_xattr_handler(int name_index)
-{
- struct devpts_xattr_handler *handler = NULL;
- if (name_index > 0 && name_index <= DEVPTS_XATTR_INDEX_MAX) {
- read_lock(&devpts_handler_lock);
- handler = devpts_xattr_handlers[name_index-1];
- read_unlock(&devpts_handler_lock);
- }
- return handler;
-}
-
-/*
- * Inode operation getxattr()
- *
- * dentry->d_inode->i_sem down
- */
-ssize_t
-devpts_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
-{
- struct devpts_xattr_handler *handler;
-
- handler = devpts_xattr_resolve_name(&name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->get(dentry, name, buffer, size);
-}
-
-/*
- * Inode operation listxattr()
- *
- * dentry->d_inode->i_sem down
- */
-ssize_t
-devpts_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
-{
- struct devpts_xattr_handler *handler = NULL;
- int i, error = 0;
- unsigned int size = 0;
- char *buf;
-
- read_lock(&devpts_handler_lock);
-
- for (i=0; i<DEVPTS_XATTR_INDEX_MAX; i++) {
- handler = devpts_xattr_handlers[i];
- if (handler)
- size += handler->list(dentry, NULL);
- }
-
- if (!buffer) {
- error = size;
- goto out;
- } else {
- error = -ERANGE;
- if (size > buffer_size)
- goto out;
- }
-
- buf = buffer;
- for (i=0; i<DEVPTS_XATTR_INDEX_MAX; i++) {
- handler = devpts_xattr_handlers[i];
- if (handler)
- buf += handler->list(dentry, buf);
- }
- error = size;
-
-out:
- read_unlock(&devpts_handler_lock);
- return size;
-}
-
-/*
- * Inode operation setxattr()
- *
- * dentry->d_inode->i_sem down
- */
-int
-devpts_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
-{
- struct devpts_xattr_handler *handler;
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
- handler = devpts_xattr_resolve_name(&name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(dentry, name, value, size, flags);
-}
-
-/*
- * Inode operation removexattr()
- *
- * dentry->d_inode->i_sem down
- */
-int
-devpts_removexattr(struct dentry *dentry, const char *name)
-{
- struct devpts_xattr_handler *handler;
-
- handler = devpts_xattr_resolve_name(&name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(dentry, name, NULL, 0, XATTR_REPLACE);
-}
-
-int __init
-init_devpts_xattr(void)
-{
-#ifdef CONFIG_DEVPTS_FS_SECURITY
- int err;
-
- err = devpts_xattr_register(DEVPTS_XATTR_INDEX_SECURITY,
- &devpts_xattr_security_handler);
- if (err)
- return err;
-#endif
-
- return 0;
-}
-
-void
-exit_devpts_xattr(void)
-{
-#ifdef CONFIG_DEVPTS_FS_SECURITY
- devpts_xattr_unregister(DEVPTS_XATTR_INDEX_SECURITY,
- &devpts_xattr_security_handler);
-#endif
-
-}
diff --git a/fs/devpts/xattr.h b/fs/devpts/xattr.h
deleted file mode 100644
index ecd74a0986a6..000000000000
--- a/fs/devpts/xattr.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- File: fs/devpts/xattr.h
-
- Derived from fs/ext3/xattr.h, changed in the following ways:
- drop everything related to persistent storage of EAs
- pass dentry rather than inode to internal methods
- only presently define a handler for security modules
-*/
-
-#include <linux/config.h>
-#include <linux/xattr.h>
-
-/* Name indexes */
-#define DEVPTS_XATTR_INDEX_MAX 10
-#define DEVPTS_XATTR_INDEX_SECURITY 1
-
-# ifdef CONFIG_DEVPTS_FS_XATTR
-
-struct devpts_xattr_handler {
- char *prefix;
- size_t (*list)(struct dentry *dentry, char *buffer);
- int (*get)(struct dentry *dentry, const char *name, void *buffer,
- size_t size);
- int (*set)(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags);
-};
-
-extern int devpts_xattr_register(int, struct devpts_xattr_handler *);
-extern void devpts_xattr_unregister(int, struct devpts_xattr_handler *);
-
-extern int devpts_setxattr(struct dentry *, const char *, const void *, size_t, int);
-extern ssize_t devpts_getxattr(struct dentry *, const char *, void *, size_t);
-extern ssize_t devpts_listxattr(struct dentry *, char *, size_t);
-extern int devpts_removexattr(struct dentry *, const char *);
-
-extern int init_devpts_xattr(void);
-extern void exit_devpts_xattr(void);
-
-# else /* CONFIG_DEVPTS_FS_XATTR */
-# define devpts_setxattr NULL
-# define devpts_getxattr NULL
-# define devpts_listxattr NULL
-# define devpts_removexattr NULL
-
-static inline int
-init_devpts_xattr(void)
-{
- return 0;
-}
-
-static inline void
-exit_devpts_xattr(void)
-{
-}
-
-# endif /* CONFIG_DEVPTS_FS_XATTR */
-
-extern struct devpts_xattr_handler devpts_xattr_security_handler;
-
diff --git a/fs/devpts/xattr_security.c b/fs/devpts/xattr_security.c
index 4291d7b35f20..864cb5c79baa 100644
--- a/fs/devpts/xattr_security.c
+++ b/fs/devpts/xattr_security.c
@@ -1,38 +1,45 @@
/*
- * File: fs/devpts/xattr_security.c
+ * Security xattr support for devpts.
+ *
+ * Author: Stephen Smalley <sds@epoch.ncsc.mil>
+ * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
*/
-
-#include <linux/module.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/security.h>
-#include "xattr.h"
+#include <linux/xattr.h>
static size_t
-devpts_xattr_security_list(struct dentry *dentry, char *buffer)
+devpts_xattr_security_list(struct inode *inode, char *list, size_t list_len,
+ const char *name, size_t name_len)
{
- return security_inode_listsecurity(dentry, buffer);
+ return security_inode_listsecurity(inode, list, list_len);
}
static int
-devpts_xattr_security_get(struct dentry *dentry, const char *name,
+devpts_xattr_security_get(struct inode *inode, const char *name,
void *buffer, size_t size)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return security_inode_getsecurity(dentry, name, buffer, size);
+ return security_inode_getsecurity(inode, name, buffer, size);
}
static int
-devpts_xattr_security_set(struct dentry *dentry, const char *name,
+devpts_xattr_security_set(struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return security_inode_setsecurity(dentry, name, value, size, flags);
+ return security_inode_setsecurity(inode, name, value, size, flags);
}
-struct devpts_xattr_handler devpts_xattr_security_handler = {
+struct xattr_handler devpts_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = devpts_xattr_security_list,
.get = devpts_xattr_security_get,
diff --git a/fs/exec.c b/fs/exec.c
index e715541b2db4..4915bffb045d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -34,6 +34,7 @@
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
+#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/swap.h>
@@ -848,8 +849,10 @@ int flush_old_exec(struct linux_binprm * bprm)
if (bprm->e_uid != current->euid || bprm->e_gid != current->egid ||
permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL) ||
- (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP))
+ (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
+ suid_keys(current);
current->mm->dumpable = 0;
+ }
/* An exec changes our domain. We are no longer part of the thread
group */
@@ -883,7 +886,7 @@ int prepare_binprm(struct linux_binprm *bprm)
mode = inode->i_mode;
/*
* Check execute perms again - if the caller has CAP_DAC_OVERRIDE,
- * vfs_permission lets a non-executable through
+ * generic_permission lets a non-executable through
*/
if (!(mode & 0111)) /* with at least _one_ execute bit set */
return -EACCES;
@@ -943,6 +946,11 @@ static inline int unsafe_exec(struct task_struct *p)
void compute_creds(struct linux_binprm *bprm)
{
int unsafe;
+
+ if (bprm->e_uid != current->uid)
+ suid_keys(current);
+ exec_keys(current);
+
task_lock(current);
unsafe = unsafe_exec(current);
security_bprm_apply_creds(bprm, unsafe);
@@ -1179,8 +1187,6 @@ out_ret:
return retval;
}
-EXPORT_SYMBOL(do_execve);
-
int set_binfmt(struct linux_binfmt *new)
{
struct linux_binfmt *old = current->binfmt;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 89d1df91411f..fb716b3f5ee0 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -280,60 +280,24 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
return error;
}
-/*
- * Inode operation permission().
- *
- * inode->i_sem: don't care
- */
-int
-ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int
+ext2_check_acl(struct inode *inode, int mask)
{
- int mode = inode->i_mode;
-
- /* Nobody gets write access to a read-only fs */
- if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
- return -EROFS;
- /* Nobody gets write access to an immutable file */
- if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
- return -EACCES;
- if (current->fsuid == inode->i_uid) {
- mode >>= 6;
- } else if (test_opt(inode->i_sb, POSIX_ACL)) {
- struct posix_acl *acl;
-
- /* The access ACL cannot grant access if the group class
- permission bits don't contain all requested permissions. */
- if (((mode >> 3) & mask & S_IRWXO) != mask)
- goto check_groups;
- acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
- if (acl) {
- int error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- if (error == -EACCES)
- goto check_capabilities;
- return error;
- } else
- goto check_groups;
- } else {
-check_groups:
- if (in_group_p(inode->i_gid))
- mode >>= 3;
+ struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
+
+ if (acl) {
+ int error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ return error;
}
- if ((mode & mask & S_IRWXO) == mask)
- return 0;
-check_capabilities:
- /* Allowed to override Discretionary Access Control? */
- if (!(mask & MAY_EXEC) ||
- (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
- if (capable(CAP_DAC_OVERRIDE))
- return 0;
- /* Read and search granted if capable(CAP_DAC_READ_SEARCH) */
- if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
- (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
- return 0;
- return -EACCES;
+ return -EAGAIN;
+}
+
+int
+ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ return generic_permission(inode, mask, ext2_check_acl);
}
/*
@@ -429,27 +393,27 @@ ext2_acl_chmod(struct inode *inode)
 * Extended attribute handlers
*/
static size_t
-ext2_xattr_list_acl_access(char *list, struct inode *inode,
- const char *name, int name_len)
+ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const size_t size = sizeof(XATTR_NAME_ACL_ACCESS);
if (!test_opt(inode->i_sb, POSIX_ACL))
return 0;
- if (list)
+ if (list && size <= list_size)
memcpy(list, XATTR_NAME_ACL_ACCESS, size);
return size;
}
static size_t
-ext2_xattr_list_acl_default(char *list, struct inode *inode,
- const char *name, int name_len)
+ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT);
if (!test_opt(inode->i_sb, POSIX_ACL))
return 0;
- if (list)
+ if (list && size <= list_size)
memcpy(list, XATTR_NAME_ACL_DEFAULT, size);
return size;
}
@@ -541,45 +505,16 @@ ext2_xattr_set_acl_default(struct inode *inode, const char *name,
return ext2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
}
-struct ext2_xattr_handler ext2_xattr_acl_access_handler = {
+struct xattr_handler ext2_xattr_acl_access_handler = {
.prefix = XATTR_NAME_ACL_ACCESS,
.list = ext2_xattr_list_acl_access,
.get = ext2_xattr_get_acl_access,
.set = ext2_xattr_set_acl_access,
};
-struct ext2_xattr_handler ext2_xattr_acl_default_handler = {
+struct xattr_handler ext2_xattr_acl_default_handler = {
.prefix = XATTR_NAME_ACL_DEFAULT,
.list = ext2_xattr_list_acl_default,
.get = ext2_xattr_get_acl_default,
.set = ext2_xattr_set_acl_default,
};
-
-void
-exit_ext2_acl(void)
-{
- ext2_xattr_unregister(EXT2_XATTR_INDEX_POSIX_ACL_ACCESS,
- &ext2_xattr_acl_access_handler);
- ext2_xattr_unregister(EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT,
- &ext2_xattr_acl_default_handler);
-}
-
-int __init
-init_ext2_acl(void)
-{
- int error;
-
- error = ext2_xattr_register(EXT2_XATTR_INDEX_POSIX_ACL_ACCESS,
- &ext2_xattr_acl_access_handler);
- if (error)
- goto fail;
- error = ext2_xattr_register(EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT,
- &ext2_xattr_acl_default_handler);
- if (error)
- goto fail;
- return 0;
-
-fail:
- exit_ext2_acl();
- return error;
-}
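
ext2_permission() above now delegates to generic_permission() with a check_acl callback, so the capability fallbacks it used to open-code (CAP_DAC_OVERRIDE, CAP_DAC_READ_SEARCH) live in the generic helper. The callback contract, visible in ext2_check_acl(): return 0 to grant, an error such as -EACCES to deny, and -EAGAIN when no ACL applies so generic_permission() falls back to the ordinary mode-bit and capability checks. A sketch, where myfs_get_acl() is an assumed helper returning the inode's access ACL or NULL:

static int myfs_check_acl(struct inode *inode, int mask)
{
	struct posix_acl *acl = myfs_get_acl(inode);	/* assumed helper */

	if (acl) {
		int error = posix_acl_permission(inode, acl, mask);
		posix_acl_release(acl);
		return error;	/* 0 grants, -EACCES denies */
	}
	return -EAGAIN;		/* no ACL: fall back to mode bits */
}

static int myfs_permission(struct inode *inode, int mask,
			   struct nameidata *nd)
{
	return generic_permission(inode, mask, myfs_check_acl);
}
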
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index b0a1c4b38be7..f613bc33d0a2 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -63,9 +63,6 @@ extern int ext2_permission (struct inode *, int, struct nameidata *);
extern int ext2_acl_chmod (struct inode *);
extern int ext2_init_acl (struct inode *, struct inode *);
-extern int init_ext2_acl(void);
-extern void exit_ext2_acl(void);
-
#else
#include <linux/sched.h>
#define ext2_permission NULL
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 713fb9b07545..f5e86141ec54 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -57,10 +57,12 @@ struct file_operations ext2_file_operations = {
struct inode_operations ext2_file_inode_operations = {
.truncate = ext2_truncate,
- .setxattr = ext2_setxattr,
- .getxattr = ext2_getxattr,
+#ifdef CONFIG_EXT2_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = ext2_removexattr,
+ .removexattr = generic_removexattr,
+#endif
.setattr = ext2_setattr,
.permission = ext2_permission,
};
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 4abd2f06b387..661a751d15f0 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -395,19 +395,23 @@ struct inode_operations ext2_dir_inode_operations = {
.rmdir = ext2_rmdir,
.mknod = ext2_mknod,
.rename = ext2_rename,
- .setxattr = ext2_setxattr,
- .getxattr = ext2_getxattr,
+#ifdef CONFIG_EXT2_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = ext2_removexattr,
+ .removexattr = generic_removexattr,
+#endif
.setattr = ext2_setattr,
.permission = ext2_permission,
};
struct inode_operations ext2_special_inode_operations = {
- .setxattr = ext2_setxattr,
- .getxattr = ext2_getxattr,
+#ifdef CONFIG_EXT2_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = ext2_removexattr,
+ .removexattr = generic_removexattr,
+#endif
.setattr = ext2_setattr,
.permission = ext2_permission,
};
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ad3face04fe4..497650e32d7f 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -800,6 +800,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
*/
sb->s_op = &ext2_sops;
sb->s_export_op = &ext2_export_ops;
+ sb->s_xattr = ext2_xattr_handlers;
root = iget(sb, EXT2_ROOT_INO);
sb->s_root = d_alloc_root(root);
if (!sb->s_root) {
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 4680ea7ecc12..9f7bac01d557 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -32,17 +32,21 @@ struct inode_operations ext2_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
- .setxattr = ext2_setxattr,
- .getxattr = ext2_getxattr,
+#ifdef CONFIG_EXT2_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = ext2_removexattr,
+ .removexattr = generic_removexattr,
+#endif
};
struct inode_operations ext2_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext2_follow_link,
- .setxattr = ext2_setxattr,
- .getxattr = ext2_getxattr,
+#ifdef CONFIG_EXT2_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext2_listxattr,
- .removexattr = ext2_removexattr,
+ .removexattr = generic_removexattr,
+#endif
};
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index c8642719c170..fffddd16b064 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -6,6 +6,9 @@
* Fix by Harrison Xing <harrison@mountainviewdata.com>.
* Extended attributes for symlinks and special files added per
* suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ * Red Hat Inc.
+ *
*/
/*
@@ -62,8 +65,6 @@
#include "acl.h"
/* These symbols may be needed by a module. */
-EXPORT_SYMBOL(ext2_xattr_register);
-EXPORT_SYMBOL(ext2_xattr_unregister);
EXPORT_SYMBOL(ext2_xattr_get);
EXPORT_SYMBOL(ext2_xattr_list);
EXPORT_SYMBOL(ext2_xattr_set);
@@ -104,101 +105,40 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
struct ext2_xattr_entry *);
static struct mb_cache *ext2_xattr_cache;
-static struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
-static rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
-
-int
-ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler)
-{
- int error = -EINVAL;
- if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
- write_lock(&ext2_handler_lock);
- if (!ext2_xattr_handlers[name_index-1]) {
- ext2_xattr_handlers[name_index-1] = handler;
- error = 0;
- }
- write_unlock(&ext2_handler_lock);
- }
- return error;
-}
-
-void
-ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler)
-{
- if (name_index > 0 || name_index <= EXT2_XATTR_INDEX_MAX) {
- write_lock(&ext2_handler_lock);
- ext2_xattr_handlers[name_index-1] = NULL;
- write_unlock(&ext2_handler_lock);
- }
-}
-
-static inline const char *
-strcmp_prefix(const char *a, const char *a_prefix)
-{
- while (*a_prefix && *a == *a_prefix) {
- a++;
- a_prefix++;
- }
- return *a_prefix ? NULL : a;
-}
+static struct xattr_handler *ext2_xattr_handler_map[EXT2_XATTR_INDEX_MAX] = {
+ [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+ [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler,
+ [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler,
+#endif
+ [EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler,
+#ifdef CONFIG_EXT2_FS_SECURITY
+ [EXT2_XATTR_INDEX_SECURITY] = &ext2_xattr_security_handler,
+#endif
+};
-/*
- * Decode the extended attribute name, and translate it into
- * the name_index and name suffix.
- */
-static struct ext2_xattr_handler *
-ext2_xattr_resolve_name(const char **name)
-{
- struct ext2_xattr_handler *handler = NULL;
- int i;
-
- if (!*name)
- return NULL;
- read_lock(&ext2_handler_lock);
- for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) {
- if (ext2_xattr_handlers[i]) {
- const char *n = strcmp_prefix(*name,
- ext2_xattr_handlers[i]->prefix);
- if (n) {
- handler = ext2_xattr_handlers[i];
- *name = n;
- break;
- }
- }
- }
- read_unlock(&ext2_handler_lock);
- return handler;
-}
+struct xattr_handler *ext2_xattr_handlers[] = {
+ &ext2_xattr_user_handler,
+ &ext2_xattr_trusted_handler,
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+ &ext2_xattr_acl_access_handler,
+ &ext2_xattr_acl_default_handler,
+#endif
+#ifdef CONFIG_EXT2_FS_SECURITY
+ &ext2_xattr_security_handler,
+#endif
+ NULL
+};
-static inline struct ext2_xattr_handler *
+static inline struct xattr_handler *
ext2_xattr_handler(int name_index)
{
- struct ext2_xattr_handler *handler = NULL;
- if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
- read_lock(&ext2_handler_lock);
- handler = ext2_xattr_handlers[name_index-1];
- read_unlock(&ext2_handler_lock);
- }
- return handler;
-}
+ struct xattr_handler *handler = NULL;
-/*
- * Inode operation getxattr()
- *
- * dentry->d_inode->i_sem: don't care
- */
-ssize_t
-ext2_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
-{
- struct ext2_xattr_handler *handler;
- struct inode *inode = dentry->d_inode;
-
- handler = ext2_xattr_resolve_name(&name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->get(inode, name, buffer, size);
+ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX)
+ handler = ext2_xattr_handler_map[name_index];
+ return handler;
}
/*
@@ -213,43 +153,6 @@ ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
/*
- * Inode operation setxattr()
- *
- * dentry->d_inode->i_sem: down
- */
-int
-ext2_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
-{
- struct ext2_xattr_handler *handler;
- struct inode *inode = dentry->d_inode;
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
- handler = ext2_xattr_resolve_name(&name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(inode, name, value, size, flags);
-}
-
-/*
- * Inode operation removexattr()
- *
- * dentry->d_inode->i_sem: down
- */
-int
-ext2_removexattr(struct dentry *dentry, const char *name)
-{
- struct ext2_xattr_handler *handler;
- struct inode *inode = dentry->d_inode;
-
- handler = ext2_xattr_resolve_name(&name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
-}
-
-/*
* ext2_xattr_get()
*
* Copy an extended attribute into the buffer
@@ -367,8 +270,8 @@ ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
{
struct buffer_head *bh = NULL;
struct ext2_xattr_entry *entry;
- size_t size = 0;
- char *buf, *end;
+ char *end;
+ size_t rest = buffer_size;
int error;
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
@@ -394,44 +297,40 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
error = -EIO;
goto cleanup;
}
- /* compute the size required for the list of attribute names */
- for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
- entry = EXT2_XATTR_NEXT(entry)) {
- struct ext2_xattr_handler *handler;
- struct ext2_xattr_entry *next =
- EXT2_XATTR_NEXT(entry);
+
+ /* check the on-disk data structure */
+ entry = FIRST_ENTRY(bh);
+ while (!IS_LAST_ENTRY(entry)) {
+ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(entry);
+
if ((char *)next >= end)
goto bad_block;
-
- handler = ext2_xattr_handler(entry->e_name_index);
- if (handler)
- size += handler->list(NULL, inode, entry->e_name,
- entry->e_name_len);
+ entry = next;
}
-
if (ext2_xattr_cache_insert(bh))
ea_idebug(inode, "cache insert failed");
- if (!buffer) {
- error = size;
- goto cleanup;
- } else {
- error = -ERANGE;
- if (size > buffer_size)
- goto cleanup;
- }
/* list the attribute names */
- buf = buffer;
for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
entry = EXT2_XATTR_NEXT(entry)) {
- struct ext2_xattr_handler *handler;
-
- handler = ext2_xattr_handler(entry->e_name_index);
- if (handler)
- buf += handler->list(buf, inode, entry->e_name,
- entry->e_name_len);
+ struct xattr_handler *handler =
+ ext2_xattr_handler(entry->e_name_index);
+
+ if (handler) {
+ size_t size = handler->list(inode, buffer, rest,
+ entry->e_name,
+ entry->e_name_len);
+ if (buffer) {
+ if (size > rest) {
+ error = -ERANGE;
+ goto cleanup;
+ }
+ buffer += size;
+ }
+ rest -= size;
+ }
}
- error = size;
+ error = buffer_size - rest; /* total size */
cleanup:
brelse(bh);
@@ -1120,66 +1019,16 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
int __init
init_ext2_xattr(void)
{
- int err;
-
- err = ext2_xattr_register(EXT2_XATTR_INDEX_USER,
- &ext2_xattr_user_handler);
- if (err)
- return err;
- err = ext2_xattr_register(EXT2_XATTR_INDEX_TRUSTED,
- &ext2_xattr_trusted_handler);
- if (err)
- goto out;
-#ifdef CONFIG_EXT2_FS_SECURITY
- err = ext2_xattr_register(EXT2_XATTR_INDEX_SECURITY,
- &ext2_xattr_security_handler);
- if (err)
- goto out1;
-#endif
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
- err = init_ext2_acl();
- if (err)
- goto out2;
-#endif
ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
sizeof(struct mb_cache_entry) +
sizeof(struct mb_cache_entry_index), 1, 6);
- if (!ext2_xattr_cache) {
- err = -ENOMEM;
- goto out3;
- }
+ if (!ext2_xattr_cache)
+ return -ENOMEM;
return 0;
-out3:
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
- exit_ext2_acl();
-out2:
-#endif
-#ifdef CONFIG_EXT2_FS_SECURITY
- ext2_xattr_unregister(EXT2_XATTR_INDEX_SECURITY,
- &ext2_xattr_security_handler);
-out1:
-#endif
- ext2_xattr_unregister(EXT2_XATTR_INDEX_TRUSTED,
- &ext2_xattr_trusted_handler);
-out:
- ext2_xattr_unregister(EXT2_XATTR_INDEX_USER,
- &ext2_xattr_user_handler);
- return err;
}
void
exit_ext2_xattr(void)
{
mb_cache_destroy(ext2_xattr_cache);
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
- exit_ext2_acl();
-#endif
-#ifdef CONFIG_EXT2_FS_SECURITY
- ext2_xattr_unregister(EXT2_XATTR_INDEX_SECURITY,
- &ext2_xattr_security_handler);
-#endif
- ext2_xattr_unregister(EXT2_XATTR_INDEX_TRUSTED,
- &ext2_xattr_trusted_handler);
- ext2_xattr_unregister(EXT2_XATTR_INDEX_USER,
- &ext2_xattr_user_handler);
}
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 6268bcdaf753..65ea5f9e723d 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -57,23 +57,13 @@ struct ext2_xattr_entry {
# ifdef CONFIG_EXT2_FS_XATTR
-struct ext2_xattr_handler {
- char *prefix;
- size_t (*list)(char *list, struct inode *inode, const char *name,
- int name_len);
- int (*get)(struct inode *inode, const char *name, void *buffer,
- size_t size);
- int (*set)(struct inode *inode, const char *name, const void *buffer,
- size_t size, int flags);
-};
-
-extern int ext2_xattr_register(int, struct ext2_xattr_handler *);
-extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *);
+extern struct xattr_handler ext2_xattr_user_handler;
+extern struct xattr_handler ext2_xattr_trusted_handler;
+extern struct xattr_handler ext2_xattr_acl_access_handler;
+extern struct xattr_handler ext2_xattr_acl_default_handler;
+extern struct xattr_handler ext2_xattr_security_handler;
-extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int);
-extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
-extern int ext2_removexattr(struct dentry *, const char *);
extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext2_xattr_list(struct inode *, char *, size_t);
@@ -85,11 +75,9 @@ extern void ext2_xattr_put_super(struct super_block *);
extern int init_ext2_xattr(void);
extern void exit_ext2_xattr(void);
+extern struct xattr_handler *ext2_xattr_handlers[];
+
# else /* CONFIG_EXT2_FS_XATTR */
-# define ext2_setxattr NULL
-# define ext2_getxattr NULL
-# define ext2_listxattr NULL
-# define ext2_removexattr NULL
static inline int
ext2_xattr_get(struct inode *inode, int name_index,
@@ -132,9 +120,7 @@ exit_ext2_xattr(void)
{
}
-# endif /* CONFIG_EXT2_FS_XATTR */
+#define ext2_xattr_handlers NULL
-extern struct ext2_xattr_handler ext2_xattr_user_handler;
-extern struct ext2_xattr_handler ext2_xattr_trusted_handler;
-extern struct ext2_xattr_handler ext2_xattr_security_handler;
+# endif /* CONFIG_EXT2_FS_XATTR */
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 837abd7bd1ca..6a6c59fbe599 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -11,17 +11,18 @@
#include "xattr.h"
static size_t
-ext2_xattr_security_list(char *list, struct inode *inode,
- const char *name, int name_len)
+ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const int prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+ const size_t total_len = prefix_len + name_len + 1;
- if (list) {
+ if (list && total_len <= list_size) {
memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
memcpy(list+prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
- return prefix_len + name_len + 1;
+ return total_len;
}
static int
@@ -44,7 +45,7 @@ ext2_xattr_security_set(struct inode *inode, const char *name,
value, size, flags);
}
-struct ext2_xattr_handler ext2_xattr_security_handler = {
+struct xattr_handler ext2_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext2_xattr_security_list,
.get = ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index d9478aed723f..52b30ee6a25f 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -15,20 +15,21 @@
#define XATTR_TRUSTED_PREFIX "trusted."
static size_t
-ext2_xattr_trusted_list(char *list, struct inode *inode,
- const char *name, int name_len)
+ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
return 0;
- if (list) {
+ if (list && total_len <= list_size) {
memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
memcpy(list+prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
- return prefix_len + name_len + 1;
+ return total_len;
}
static int
@@ -55,7 +56,7 @@ ext2_xattr_trusted_set(struct inode *inode, const char *name,
value, size, flags);
}
-struct ext2_xattr_handler ext2_xattr_trusted_handler = {
+struct xattr_handler ext2_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ext2_xattr_trusted_list,
.get = ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index be1558761064..0c03ea131a94 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -14,20 +14,21 @@
#define XATTR_USER_PREFIX "user."
static size_t
-ext2_xattr_user_list(char *list, struct inode *inode,
- const char *name, int name_len)
+ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
- const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
return 0;
- if (list) {
+ if (list && total_len <= list_size) {
memcpy(list, XATTR_USER_PREFIX, prefix_len);
memcpy(list+prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
- return prefix_len + name_len + 1;
+ return total_len;
}
static int
@@ -68,23 +69,9 @@ ext2_xattr_user_set(struct inode *inode, const char *name,
value, size, flags);
}
-struct ext2_xattr_handler ext2_xattr_user_handler = {
+struct xattr_handler ext2_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ext2_xattr_user_list,
.get = ext2_xattr_user_get,
.set = ext2_xattr_user_set,
};
-
-int __init
-init_ext2_xattr_user(void)
-{
- return ext2_xattr_register(EXT2_XATTR_INDEX_USER,
- &ext2_xattr_user_handler);
-}
-
-void
-exit_ext2_xattr_user(void)
-{
- ext2_xattr_unregister(EXT2_XATTR_INDEX_USER,
- &ext2_xattr_user_handler);
-}
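
The handler ->list methods converted above all follow one convention: always return the number of bytes the entry needs, and copy only when a buffer was supplied and the entry fits. ext2_xattr_list() (earlier in this patch) drives them with a running `rest` counter so that a NULL buffer yields the total size required. A condensed caller-side sketch under assumed "myfs" names:

/*
 * Caller-side sketch of the ->list convention above. With buffer == NULL
 * (and buffer_size == 0), `rest` wraps as a size_t, but buffer_size - rest
 * still comes out to the total bytes required -- the same arithmetic
 * ext2_xattr_list() relies on.
 */
static ssize_t myfs_list_names(struct inode *inode,
			       struct xattr_handler **handlers,
			       char *buffer, size_t buffer_size)
{
	size_t rest = buffer_size;
	int i;

	for (i = 0; handlers[i]; i++) {
		size_t size = handlers[i]->list(inode, buffer, rest,
						"example", 7);
		if (buffer) {
			if (size > rest)
				return -ERANGE;	/* supplied buffer too small */
			buffer += size;
		}
		rest -= size;
	}
	return buffer_size - rest;	/* bytes used, or bytes needed */
}
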
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index a3cf77de0e43..4a18653e831d 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -285,60 +285,24 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-/*
- * Inode operation permission().
- *
- * inode->i_sem: don't care
- */
-int
-ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int
+ext3_check_acl(struct inode *inode, int mask)
{
- int mode = inode->i_mode;
-
- /* Nobody gets write access to a read-only fs */
- if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
- return -EROFS;
- /* Nobody gets write access to an immutable file */
- if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
- return -EACCES;
- if (current->fsuid == inode->i_uid) {
- mode >>= 6;
- } else if (test_opt(inode->i_sb, POSIX_ACL)) {
- struct posix_acl *acl;
-
- /* The access ACL cannot grant access if the group class
- permission bits don't contain all requested permissions. */
- if (((mode >> 3) & mask & S_IRWXO) != mask)
- goto check_groups;
- acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
- if (acl) {
- int error = posix_acl_permission(inode, acl, mask);
- posix_acl_release(acl);
- if (error == -EACCES)
- goto check_capabilities;
- return error;
- } else
- goto check_groups;
- } else {
-check_groups:
- if (in_group_p(inode->i_gid))
- mode >>= 3;
+ struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
+
+ if (acl) {
+ int error = posix_acl_permission(inode, acl, mask);
+ posix_acl_release(acl);
+ return error;
}
- if ((mode & mask & S_IRWXO) == mask)
- return 0;
-check_capabilities:
- /* Allowed to override Discretionary Access Control? */
- if (!(mask & MAY_EXEC) ||
- (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
- if (capable(CAP_DAC_OVERRIDE))
- return 0;
- /* Read and search granted if capable(CAP_DAC_READ_SEARCH) */
- if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
- (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
- return 0;
- return -EACCES;
+ return -EAGAIN;
+}
+
+int
+ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ return generic_permission(inode, mask, ext3_check_acl);
}
/*
@@ -452,27 +416,27 @@ out:
* Extended attribute handlers
*/
static size_t
-ext3_xattr_list_acl_access(char *list, struct inode *inode,
- const char *name, int name_len)
+ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
+ const char *name, size_t name_len)
{
const size_t size = sizeof(XATTR_NAME_ACL_ACCESS);
if (!test_opt(inode->i_sb, POSIX_ACL))
return 0;
- if (list)
+ if (list && size <= list_len)
memcpy(list, XATTR_NAME_ACL_ACCESS, size);
return size;
}
static size_t
-ext3_xattr_list_acl_default(char *list, struct inode *inode,
- const char *name, int name_len)
+ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
+ const char *name, size_t name_len)
{
const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT);
if (!test_opt(inode->i_sb, POSIX_ACL))
return 0;
- if (list)
+ if (list && size <= list_len)
memcpy(list, XATTR_NAME_ACL_DEFAULT, size);
return size;
}
@@ -572,45 +536,16 @@ ext3_xattr_set_acl_default(struct inode *inode, const char *name,
return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
}
-struct ext3_xattr_handler ext3_xattr_acl_access_handler = {
+struct xattr_handler ext3_xattr_acl_access_handler = {
.prefix = XATTR_NAME_ACL_ACCESS,
.list = ext3_xattr_list_acl_access,
.get = ext3_xattr_get_acl_access,
.set = ext3_xattr_set_acl_access,
};
-struct ext3_xattr_handler ext3_xattr_acl_default_handler = {
+struct xattr_handler ext3_xattr_acl_default_handler = {
.prefix = XATTR_NAME_ACL_DEFAULT,
.list = ext3_xattr_list_acl_default,
.get = ext3_xattr_get_acl_default,
.set = ext3_xattr_set_acl_default,
};
-
-void
-exit_ext3_acl(void)
-{
- ext3_xattr_unregister(EXT3_XATTR_INDEX_POSIX_ACL_ACCESS,
- &ext3_xattr_acl_access_handler);
- ext3_xattr_unregister(EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT,
- &ext3_xattr_acl_default_handler);
-}
-
-int __init
-init_ext3_acl(void)
-{
- int error;
-
- error = ext3_xattr_register(EXT3_XATTR_INDEX_POSIX_ACL_ACCESS,
- &ext3_xattr_acl_access_handler);
- if (error)
- goto fail;
- error = ext3_xattr_register(EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT,
- &ext3_xattr_acl_default_handler);
- if (error)
- goto fail;
- return 0;
-
-fail:
- exit_ext3_acl();
- return error;
-}
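A note on the contract these hunks establish: generic_permission() only calls
check_acl() when IS_POSIXACL(inode) is set and the group bits of the mode are
non-zero, and it treats -EAGAIN as "no ACL verdict, fall back to the mode
bits" (see the fs/namei.c hunk later in this patch). A minimal sketch of how
a filesystem wires into this, with hypothetical myfs_* names standing in for
the ext3 helpers:

	static int myfs_check_acl(struct inode *inode, int mask)
	{
		/* myfs_get_acl() is assumed here; it plays the role of
		 * ext3_get_acl() above and may return an ERR_PTR(). */
		struct posix_acl *acl = myfs_get_acl(inode, ACL_TYPE_ACCESS);

		if (IS_ERR(acl))
			return PTR_ERR(acl);
		if (acl) {
			int error = posix_acl_permission(inode, acl, mask);
			posix_acl_release(acl);
			return error;	/* 0 or -EACCES */
		}
		return -EAGAIN;		/* no ACL: use the mode bits */
	}

	int myfs_permission(struct inode *inode, int mask, struct nameidata *nd)
	{
		return generic_permission(inode, mask, myfs_check_acl);
	}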
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 50ba6861a247..fb39ecb44b3b 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -132,10 +132,12 @@ struct file_operations ext3_file_operations = {
struct inode_operations ext3_file_inode_operations = {
.truncate = ext3_truncate,
.setattr = ext3_setattr,
- .setxattr = ext3_setxattr,
- .getxattr = ext3_getxattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext3_listxattr,
- .removexattr = ext3_removexattr,
+ .removexattr = generic_removexattr,
+#endif
.permission = ext3_permission,
};
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 0e48f620d9e8..cf7225964a33 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1596,9 +1596,14 @@ out_stop:
if (end > inode->i_size) {
ei->i_disksize = end;
i_size_write(inode, end);
- err = ext3_mark_inode_dirty(handle, inode);
- if (!ret)
- ret = err;
+ /*
+ * We're going to return a positive `ret'
+ * here due to non-zero-length I/O, so there's
+ * no way of reporting error returns from
+ * ext3_mark_inode_dirty() to userspace. So
+ * ignore it.
+ */
+ ext3_mark_inode_dirty(handle, inode);
}
}
err = ext3_journal_stop(handle);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index e7e287e19cfe..cf359ec81ed6 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2352,18 +2352,22 @@ struct inode_operations ext3_dir_inode_operations = {
.mknod = ext3_mknod,
.rename = ext3_rename,
.setattr = ext3_setattr,
- .setxattr = ext3_setxattr,
- .getxattr = ext3_getxattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext3_listxattr,
- .removexattr = ext3_removexattr,
+ .removexattr = generic_removexattr,
+#endif
.permission = ext3_permission,
};
struct inode_operations ext3_special_inode_operations = {
.setattr = ext3_setattr,
- .setxattr = ext3_setxattr,
- .getxattr = ext3_getxattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext3_listxattr,
- .removexattr = ext3_removexattr,
+ .removexattr = generic_removexattr,
+#endif
.permission = ext3_permission,
};
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a93964e67a2c..64d8840a6b7d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1474,6 +1474,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
*/
sb->s_op = &ext3_sops;
sb->s_export_op = &ext3_export_ops;
+ sb->s_xattr = ext3_xattr_handlers;
#ifdef CONFIG_QUOTA
sb->s_qcop = &ext3_qctl_operations;
sb->dq_op = &ext3_quota_operations;
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index 867f713a102c..8c3e72818fb0 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -34,17 +34,21 @@ struct inode_operations ext3_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
- .setxattr = ext3_setxattr,
- .getxattr = ext3_getxattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext3_listxattr,
- .removexattr = ext3_removexattr,
+ .removexattr = generic_removexattr,
+#endif
};
struct inode_operations ext3_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext3_follow_link,
- .setxattr = ext3_setxattr,
- .getxattr = ext3_getxattr,
+#ifdef CONFIG_EXT3_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = ext3_listxattr,
- .removexattr = ext3_removexattr,
+ .removexattr = generic_removexattr,
+#endif
};
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index e6df2ba425d6..b06cd4de6830 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -7,6 +7,8 @@
* Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
* Extended attributes for symlinks and special files added per
* suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ * Red Hat Inc.
*/
/*
@@ -100,101 +102,40 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *,
struct ext3_xattr_entry *);
static struct mb_cache *ext3_xattr_cache;
-static struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
-static rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
-int
-ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
-{
- int error = -EINVAL;
-
- if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
- write_lock(&ext3_handler_lock);
- if (!ext3_xattr_handlers[name_index-1]) {
- ext3_xattr_handlers[name_index-1] = handler;
- error = 0;
- }
- write_unlock(&ext3_handler_lock);
- }
- return error;
-}
-
-void
-ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
-{
- if (name_index > 0 || name_index <= EXT3_XATTR_INDEX_MAX) {
- write_lock(&ext3_handler_lock);
- ext3_xattr_handlers[name_index-1] = NULL;
- write_unlock(&ext3_handler_lock);
- }
-}
-
-static inline const char *
-strcmp_prefix(const char *a, const char *a_prefix)
-{
- while (*a_prefix && *a == *a_prefix) {
- a++;
- a_prefix++;
- }
- return *a_prefix ? NULL : a;
-}
+static struct xattr_handler *ext3_xattr_handler_map[EXT3_XATTR_INDEX_MAX] = {
+ [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
+ [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
+ [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler,
+#endif
+ [EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler,
+#ifdef CONFIG_EXT3_FS_SECURITY
+ [EXT3_XATTR_INDEX_SECURITY] = &ext3_xattr_security_handler,
+#endif
+};
-/*
- * Decode the extended attribute name, and translate it into
- * the name_index and name suffix.
- */
-static inline struct ext3_xattr_handler *
-ext3_xattr_resolve_name(const char **name)
-{
- struct ext3_xattr_handler *handler = NULL;
- int i;
-
- if (!*name)
- return NULL;
- read_lock(&ext3_handler_lock);
- for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
- if (ext3_xattr_handlers[i]) {
- const char *n = strcmp_prefix(*name,
- ext3_xattr_handlers[i]->prefix);
- if (n) {
- handler = ext3_xattr_handlers[i];
- *name = n;
- break;
- }
- }
- }
- read_unlock(&ext3_handler_lock);
- return handler;
-}
+struct xattr_handler *ext3_xattr_handlers[] = {
+ &ext3_xattr_user_handler,
+ &ext3_xattr_trusted_handler,
+#ifdef CONFIG_EXT3_FS_POSIX_ACL
+ &ext3_xattr_acl_access_handler,
+ &ext3_xattr_acl_default_handler,
+#endif
+#ifdef CONFIG_EXT3_FS_SECURITY
+ &ext3_xattr_security_handler,
+#endif
+ NULL
+};
-static inline struct ext3_xattr_handler *
+static inline struct xattr_handler *
ext3_xattr_handler(int name_index)
{
- struct ext3_xattr_handler *handler = NULL;
- if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
- read_lock(&ext3_handler_lock);
- handler = ext3_xattr_handlers[name_index-1];
- read_unlock(&ext3_handler_lock);
- }
- return handler;
-}
-
-/*
- * Inode operation getxattr()
- *
- * dentry->d_inode->i_sem: don't care
- */
-ssize_t
-ext3_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
-{
- struct ext3_xattr_handler *handler;
- struct inode *inode = dentry->d_inode;
+ struct xattr_handler *handler = NULL;
- handler = ext3_xattr_resolve_name(&name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->get(inode, name, buffer, size);
+ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX)
+ handler = ext3_xattr_handler_map[name_index];
+ return handler;
}
/*
@@ -209,43 +150,6 @@ ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
/*
- * Inode operation setxattr()
- *
- * dentry->d_inode->i_sem: down
- */
-int
-ext3_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
-{
- struct ext3_xattr_handler *handler;
- struct inode *inode = dentry->d_inode;
-
- if (size == 0)
- value = ""; /* empty EA, do not remove */
- handler = ext3_xattr_resolve_name(&name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(inode, name, value, size, flags);
-}
-
-/*
- * Inode operation removexattr()
- *
- * dentry->d_inode->i_sem: down
- */
-int
-ext3_removexattr(struct dentry *dentry, const char *name)
-{
- struct ext3_xattr_handler *handler;
- struct inode *inode = dentry->d_inode;
-
- handler = ext3_xattr_resolve_name(&name);
- if (!handler)
- return -EOPNOTSUPP;
- return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
-}
-
-/*
* ext3_xattr_get()
*
* Copy an extended attribute into the buffer
@@ -363,8 +267,8 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
{
struct buffer_head *bh = NULL;
struct ext3_xattr_entry *entry;
- size_t size = 0;
- char *buf, *end;
+ char *end;
+ size_t rest = buffer_size;
int error;
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
@@ -390,44 +294,40 @@ bad_block: ext3_error(inode->i_sb, "ext3_xattr_list",
error = -EIO;
goto cleanup;
}
- /* compute the size required for the list of attribute names */
- for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
- entry = EXT3_XATTR_NEXT(entry)) {
- struct ext3_xattr_handler *handler;
- struct ext3_xattr_entry *next =
- EXT3_XATTR_NEXT(entry);
+
+ /* check the on-disk data structure */
+ entry = FIRST_ENTRY(bh);
+ while (!IS_LAST_ENTRY(entry)) {
+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(entry);
+
if ((char *)next >= end)
goto bad_block;
-
- handler = ext3_xattr_handler(entry->e_name_index);
- if (handler)
- size += handler->list(NULL, inode, entry->e_name,
- entry->e_name_len);
+ entry = next;
}
-
if (ext3_xattr_cache_insert(bh))
ea_idebug(inode, "cache insert failed");
- if (!buffer) {
- error = size;
- goto cleanup;
- } else {
- error = -ERANGE;
- if (size > buffer_size)
- goto cleanup;
- }
/* list the attribute names */
- buf = buffer;
for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
entry = EXT3_XATTR_NEXT(entry)) {
- struct ext3_xattr_handler *handler;
-
- handler = ext3_xattr_handler(entry->e_name_index);
- if (handler)
- buf += handler->list(buf, inode, entry->e_name,
- entry->e_name_len);
+ struct xattr_handler *handler =
+ ext3_xattr_handler(entry->e_name_index);
+
+ if (handler) {
+ size_t size = handler->list(inode, buffer, rest,
+ entry->e_name,
+ entry->e_name_len);
+ if (buffer) {
+ if (size > rest) {
+ error = -ERANGE;
+ goto cleanup;
+ }
+ buffer += size;
+ }
+ rest -= size;
+ }
}
- error = size;
+ error = buffer_size - rest; /* total size */
cleanup:
brelse(bh);
@@ -1179,51 +1079,12 @@ static void ext3_xattr_rehash(struct ext3_xattr_header *header,
int __init
init_ext3_xattr(void)
{
- int err;
-
- err = ext3_xattr_register(EXT3_XATTR_INDEX_USER,
- &ext3_xattr_user_handler);
- if (err)
- return err;
- err = ext3_xattr_register(EXT3_XATTR_INDEX_TRUSTED,
- &ext3_xattr_trusted_handler);
- if (err)
- goto out;
-#ifdef CONFIG_EXT3_FS_SECURITY
- err = ext3_xattr_register(EXT3_XATTR_INDEX_SECURITY,
- &ext3_xattr_security_handler);
- if (err)
- goto out1;
-#endif
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- err = init_ext3_acl();
- if (err)
- goto out2;
-#endif
ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
sizeof(struct mb_cache_entry) +
sizeof(struct mb_cache_entry_index), 1, 6);
- if (!ext3_xattr_cache) {
- err = -ENOMEM;
- goto out3;
- }
+ if (!ext3_xattr_cache)
+ return -ENOMEM;
return 0;
-out3:
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- exit_ext3_acl();
-out2:
-#endif
-#ifdef CONFIG_EXT3_FS_SECURITY
- ext3_xattr_unregister(EXT3_XATTR_INDEX_SECURITY,
- &ext3_xattr_security_handler);
-out1:
-#endif
- ext3_xattr_unregister(EXT3_XATTR_INDEX_TRUSTED,
- &ext3_xattr_trusted_handler);
-out:
- ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
- &ext3_xattr_user_handler);
- return err;
}
void
@@ -1232,15 +1093,4 @@ exit_ext3_xattr(void)
if (ext3_xattr_cache)
mb_cache_destroy(ext3_xattr_cache);
ext3_xattr_cache = NULL;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
- exit_ext3_acl();
-#endif
-#ifdef CONFIG_EXT3_FS_SECURITY
- ext3_xattr_unregister(EXT3_XATTR_INDEX_SECURITY,
- &ext3_xattr_security_handler);
-#endif
- ext3_xattr_unregister(EXT3_XATTR_INDEX_TRUSTED,
- &ext3_xattr_trusted_handler);
- ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
- &ext3_xattr_user_handler);
}
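With the registration API gone, dispatch happens in two places: by index for
on-disk entries (the ext3_xattr_handler_map[] lookup above) and by name
prefix in the VFS, which walks the NULL-terminated sb->s_xattr array that
fs/ext3/super.c now points at ext3_xattr_handlers[]. The prefix match done
by the generic_*xattr helpers is roughly the following (a simplified sketch,
not the literal fs/xattr.c code):

	static struct xattr_handler *
	xattr_resolve(struct xattr_handler **handlers, const char **name)
	{
		struct xattr_handler *h;

		for (; (h = *handlers) != NULL; handlers++) {
			size_t n = strlen(h->prefix);

			if (!strncmp(*name, h->prefix, n)) {
				*name += n;	/* strip "user.", "trusted.", ... */
				return h;
			}
		}
		return NULL;	/* caller returns -EOPNOTSUPP */
	}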
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 2aabe9f591d2..6ca51c0e65fa 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -56,23 +56,13 @@ struct ext3_xattr_entry {
# ifdef CONFIG_EXT3_FS_XATTR
-struct ext3_xattr_handler {
- char *prefix;
- size_t (*list)(char *list, struct inode *inode, const char *name,
- int name_len);
- int (*get)(struct inode *inode, const char *name, void *buffer,
- size_t size);
- int (*set)(struct inode *inode, const char *name, const void *buffer,
- size_t size, int flags);
-};
-
-extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
-extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
+extern struct xattr_handler ext3_xattr_user_handler;
+extern struct xattr_handler ext3_xattr_trusted_handler;
+extern struct xattr_handler ext3_xattr_acl_access_handler;
+extern struct xattr_handler ext3_xattr_acl_default_handler;
+extern struct xattr_handler ext3_xattr_security_handler;
-extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int);
-extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
-extern int ext3_removexattr(struct dentry *, const char *);
extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
extern int ext3_xattr_list(struct inode *, char *, size_t);
@@ -85,11 +75,9 @@ extern void ext3_xattr_put_super(struct super_block *);
extern int init_ext3_xattr(void);
extern void exit_ext3_xattr(void);
+extern struct xattr_handler *ext3_xattr_handlers[];
+
# else /* CONFIG_EXT3_FS_XATTR */
-# define ext3_setxattr NULL
-# define ext3_getxattr NULL
-# define ext3_listxattr NULL
-# define ext3_removexattr NULL
static inline int
ext3_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -139,8 +127,6 @@ exit_ext3_xattr(void)
{
}
-# endif /* CONFIG_EXT3_FS_XATTR */
+#define ext3_xattr_handlers NULL
-extern struct ext3_xattr_handler ext3_xattr_user_handler;
-extern struct ext3_xattr_handler ext3_xattr_trusted_handler;
-extern struct ext3_xattr_handler ext3_xattr_security_handler;
+# endif /* CONFIG_EXT3_FS_XATTR */
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index c948effaa257..ddc1c41750e1 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -12,17 +12,19 @@
#include "xattr.h"
static size_t
-ext3_xattr_security_list(char *list, struct inode *inode,
- const char *name, int name_len)
+ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
- const int prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+ const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+ const size_t total_len = prefix_len + name_len + 1;
- if (list) {
+
+ if (list && total_len <= list_size) {
memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
memcpy(list+prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
- return prefix_len + name_len + 1;
+ return total_len;
}
static int
@@ -45,7 +47,7 @@ ext3_xattr_security_set(struct inode *inode, const char *name,
value, size, flags);
}
-struct ext3_xattr_handler ext3_xattr_security_handler = {
+struct xattr_handler ext3_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.list = ext3_xattr_security_list,
.get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index 45b1549c4491..f68bfd1cf519 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -16,20 +16,21 @@
#define XATTR_TRUSTED_PREFIX "trusted."
static size_t
-ext3_xattr_trusted_list(char *list, struct inode *inode,
- const char *name, int name_len)
+ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
- const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
return 0;
- if (list) {
+ if (list && total_len <= list_size) {
memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
memcpy(list+prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
- return prefix_len + name_len + 1;
+ return total_len;
}
static int
@@ -56,7 +57,7 @@ ext3_xattr_trusted_set(struct inode *inode, const char *name,
value, size, flags);
}
-struct ext3_xattr_handler ext3_xattr_trusted_handler = {
+struct xattr_handler ext3_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.list = ext3_xattr_trusted_list,
.get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 84877afff67a..e907cae7a07c 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -16,20 +16,21 @@
#define XATTR_USER_PREFIX "user."
static size_t
-ext3_xattr_user_list(char *list, struct inode *inode,
- const char *name, int name_len)
+ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
- const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
return 0;
- if (list) {
+ if (list && total_len <= list_size) {
memcpy(list, XATTR_USER_PREFIX, prefix_len);
memcpy(list+prefix_len, name, name_len);
list[prefix_len + name_len] = '\0';
}
- return prefix_len + name_len + 1;
+ return total_len;
}
static int
@@ -70,7 +71,7 @@ ext3_xattr_user_set(struct inode *inode, const char *name,
value, size, flags);
}
-struct ext3_xattr_handler ext3_xattr_user_handler = {
+struct xattr_handler ext3_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
.list = ext3_xattr_user_list,
.get = ext3_xattr_user_get,
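All three per-prefix handlers now follow the same ->list() contract: the
callee bounds-checks its own copy against list_size, but always returns the
full length the entry needs (prefix plus name plus the terminating NUL), so
a caller can pass a NULL or short buffer to size things first. A template
for a handler under this contract (the "myfs." prefix is hypothetical):

	#define XATTR_MYFS_PREFIX "myfs."

	static size_t
	myfs_xattr_list(struct inode *inode, char *list, size_t list_size,
			const char *name, size_t name_len)
	{
		const size_t prefix_len = sizeof(XATTR_MYFS_PREFIX) - 1;
		const size_t total_len = prefix_len + name_len + 1;

		/* Copy only when the caller's buffer can hold the entry... */
		if (list && total_len <= list_size) {
			memcpy(list, XATTR_MYFS_PREFIX, prefix_len);
			memcpy(list + prefix_len, name, name_len);
			list[prefix_len + name_len] = '\0';
		}
		/* ...but always report the space the entry would take. */
		return total_len;
	}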
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ee380e7b9569..da78d7bba880 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -291,8 +291,6 @@ void f_delown(struct file *filp)
f_modown(filp, 0, 0, 0, 1);
}
-EXPORT_SYMBOL(f_delown);
-
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
diff --git a/fs/file_table.c b/fs/file_table.c
index 3750a140ef43..e9ce5279ef6d 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -24,11 +24,9 @@ struct files_stat_struct files_stat = {
EXPORT_SYMBOL(files_stat); /* Needed by unix.o */
-/* public *and* exported. Not pretty! */
+/* public. Not pretty! */
spinlock_t __cacheline_aligned_in_smp files_lock = SPIN_LOCK_UNLOCKED;
-EXPORT_SYMBOL(files_lock);
-
static spinlock_t filp_count_lock = SPIN_LOCK_UNLOCKED;
/* slab constructors and destructors are called from arbitrary
@@ -199,8 +197,6 @@ void put_filp(struct file *file)
}
}
-EXPORT_SYMBOL(put_filp);
-
void file_move(struct file *file, struct list_head *list)
{
if (!list)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index da522d511f2d..969e9b0e0afc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -244,6 +244,8 @@ static int
__writeback_single_inode(struct inode *inode,
struct writeback_control *wbc)
{
+ wait_queue_head_t *wqh;
+
if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
list_move(&inode->i_list, &inode->i_sb->s_dirty);
return 0;
@@ -252,12 +254,18 @@ __writeback_single_inode(struct inode *inode,
/*
* It's a data-integrity sync. We must wait.
*/
- while (inode->i_state & I_LOCK) {
- __iget(inode);
- spin_unlock(&inode_lock);
- __wait_on_inode(inode);
- iput(inode);
- spin_lock(&inode_lock);
+ if (inode->i_state & I_LOCK) {
+ DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LOCK);
+
+ wqh = bit_waitqueue(&inode->i_state, __I_LOCK);
+ do {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ __wait_on_bit(wqh, &wq, inode_wait,
+ TASK_UNINTERRUPTIBLE);
+ iput(inode);
+ spin_lock(&inode_lock);
+ } while (inode->i_state & I_LOCK);
}
return __sync_single_inode(inode, wbc);
}
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 6c869f377964..ae55ce1a5405 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -517,7 +517,7 @@ static int hfs_permission(struct inode *inode, int mask,
{
if (S_ISREG(inode->i_mode) && mask & MAY_EXEC)
return 0;
- return vfs_permission(inode, mask);
+ return generic_permission(inode, mask, NULL);
}
static int hfs_file_open(struct inode *inode, struct file *file)
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index eff1c987b6fb..f58025cb38af 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -260,7 +260,7 @@ static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *n
*/
if (S_ISREG(inode->i_mode) && mask & MAY_EXEC && !(inode->i_mode & 0111))
return 0;
- return vfs_permission(inode, mask);
+ return generic_permission(inode, mask, NULL);
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7d42da0a304d..937fac26b34a 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -808,7 +808,7 @@ int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
if(name == NULL) return(-ENOMEM);
err = access_file(name, r, w, x);
kfree(name);
- if(!err) err = vfs_permission(ino, desired);
+ if(!err) err = generic_permission(ino, desired, NULL);
return(err);
}
diff --git a/fs/inode.c b/fs/inode.c
index 8cd74200bdff..1fa7de5e8f84 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1264,37 +1264,10 @@ void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree
#endif
-/*
- * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
- * kernel doesn't lock many inodes at the same time.
- */
-#define I_WAIT_TABLE_ORDER 3
-static struct i_wait_queue_head {
- wait_queue_head_t wqh;
-} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
-
-/*
- * Return the address of the waitqueue_head to be used for this inode
- */
-static wait_queue_head_t *i_waitq_head(struct inode *inode)
-{
- return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
-}
-
-void __wait_on_inode(struct inode *inode)
+int inode_wait(void *word)
{
- DECLARE_WAITQUEUE(wait, current);
- wait_queue_head_t *wq = i_waitq_head(inode);
-
- add_wait_queue(wq, &wait);
-repeat:
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (inode->i_state & I_LOCK) {
- schedule();
- goto repeat;
- }
- remove_wait_queue(wq, &wait);
- __set_current_state(TASK_RUNNING);
+ schedule();
+ return 0;
}
/*
@@ -1303,36 +1276,39 @@ repeat:
* that it isn't found. This is because iget will immediately call
* ->read_inode, and we want to be sure that evidence of the deletion is found
* by ->read_inode.
- *
- * This call might return early if an inode which shares the waitq is woken up.
- * This is most easily handled by the caller which will loop around again
- * looking for the inode.
- *
* This is called with inode_lock held.
*/
static void __wait_on_freeing_inode(struct inode *inode)
{
- DECLARE_WAITQUEUE(wait, current);
- wait_queue_head_t *wq = i_waitq_head(inode);
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK);
- add_wait_queue(wq, &wait);
- set_current_state(TASK_UNINTERRUPTIBLE);
+ /*
+ * I_FREEING and I_CLEAR are cleared in process context under
+	 * inode_lock, so we have to give the tasks that would clear them
+ * a chance to run and acquire inode_lock.
+ */
+ if (!(inode->i_state & I_LOCK)) {
+ spin_unlock(&inode_lock);
+ yield();
+ spin_lock(&inode_lock);
+ return;
+ }
+ wq = bit_waitqueue(&inode->i_state, __I_LOCK);
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode_lock);
schedule();
- remove_wait_queue(wq, &wait);
+ finish_wait(wq, &wait.wait);
spin_lock(&inode_lock);
}
void wake_up_inode(struct inode *inode)
{
- wait_queue_head_t *wq = i_waitq_head(inode);
-
/*
* Prevent speculative execution through spin_unlock(&inode_lock);
*/
smp_mb();
- if (waitqueue_active(wq))
- wake_up_all(wq);
+ wake_up_bit(&inode->i_state, __I_LOCK);
}
static __initdata unsigned long ihash_entries;
@@ -1367,11 +1343,6 @@ void __init inode_init_early(void)
void __init inode_init(unsigned long mempages)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
- init_waitqueue_head(&i_wait_queue_heads[i].wqh);
-
/* inode slab cache */
inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
0, SLAB_PANIC, init_once, NULL);
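The hashed i_wait_queue_heads[] table is replaced by the generic bit
waitqueues keyed on the __I_LOCK bit of i_state, which is why
wake_up_inode() shrinks to a single wake_up_bit(). The wait/wake pairing,
condensed from this hunk and the fs/fs-writeback.c hunk earlier in the
patch (simplified: the real code runs under inode_lock and rechecks the
inode state around the sleep):

	/* Wait side: sleep until __I_LOCK clears. */
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LOCK);
	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_LOCK);

	__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);

	/* Wake side: clear the bit, then wake the hashed bit waitqueue. */
	inode->i_state &= ~I_LOCK;
	smp_mb();	/* order the clear before the waitqueue check */
	wake_up_bit(&inode->i_state, __I_LOCK);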
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f8a1dea56611..b4d6654ef7f2 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -579,7 +579,7 @@ wait_for_iobuf:
journal_file_buffer(jh, commit_transaction, BJ_Forget);
/* Wake up any transactions which were waiting for this
IO to complete */
- wake_up_buffer(bh);
+ wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 18a678ce2591..a168757d26af 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -633,21 +633,22 @@ repeat:
* disk then we cannot do copy-out here. */
if (jh->b_jlist == BJ_Shadow) {
+ DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
wait_queue_head_t *wqh;
- DEFINE_WAIT(wait);
+
+ wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "on shadow: sleep");
jbd_unlock_bh_state(bh);
/* commit wakes up all shadow buffers after IO */
- wqh = bh_waitq_head(bh);
for ( ; ; ) {
- prepare_to_wait(wqh, &wait,
+ prepare_to_wait(wqh, &wait.wait,
TASK_UNINTERRUPTIBLE);
if (jh->b_jlist != BJ_Shadow)
break;
schedule();
}
- finish_wait(wqh, &wait);
+ finish_wait(wqh, &wait.wait);
goto repeat;
}
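Note the asymmetry with the inode case: the commit path (fs/jbd/commit.c
hunk above) wakes waiters with wake_up_bit(&bh->b_state, BH_Unshadow), but
the condition waited for here is "jh has left the shadow list", not the bit
value itself, so this code open-codes the wait loop on the shared bit
waitqueue instead of using __wait_on_bit(). The general shape of that
pattern (condition_met() is a placeholder for jh->b_jlist != BJ_Shadow):

	wait_queue_head_t *wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
	DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);

	for (;;) {
		prepare_to_wait(wqh, &wait.wait, TASK_UNINTERRUPTIBLE);
		if (condition_met())
			break;
		schedule();
	}
	finish_wait(wqh, &wait.wait);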
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index fc584f2194f4..8d2a9ab981d4 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -123,88 +123,25 @@ out:
return rc;
}
-/*
- * jfs_permission()
- *
- * modified vfs_permission to check posix acl
- */
-int jfs_permission(struct inode * inode, int mask, struct nameidata *nd)
+static int jfs_check_acl(struct inode *inode, int mask)
{
- umode_t mode = inode->i_mode;
struct jfs_inode_info *ji = JFS_IP(inode);
- if (mask & MAY_WRITE) {
- /*
- * Nobody gets write access to a read-only fs.
- */
- if (IS_RDONLY(inode) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
- return -EROFS;
-
- /*
- * Nobody gets write access to an immutable file.
- */
- if (IS_IMMUTABLE(inode))
- return -EACCES;
- }
-
- if (current->fsuid == inode->i_uid) {
- mode >>= 6;
- goto check_mode;
- }
- /*
- * ACL can't contain additional permissions if the ACL_MASK entry
- * is zero.
- */
- if (!(mode & S_IRWXG))
- goto check_groups;
-
if (ji->i_acl == JFS_ACL_NOT_CACHED) {
- struct posix_acl *acl;
-
- acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
-
+ struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl))
return PTR_ERR(acl);
posix_acl_release(acl);
}
- if (ji->i_acl) {
- int rc = posix_acl_permission(inode, ji->i_acl, mask);
- if (rc == -EACCES)
- goto check_capabilities;
- return rc;
- }
-
-check_groups:
- if (in_group_p(inode->i_gid))
- mode >>= 3;
-
-check_mode:
- /*
- * If the DACs are ok we don't need any capability check.
- */
- if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
- return 0;
+ if (ji->i_acl)
+ return posix_acl_permission(inode, ji->i_acl, mask);
+ return -EAGAIN;
+}
-check_capabilities:
- /*
- * Read/write DACs are always overridable.
- * Executable DACs are overridable if at least one exec bit is set.
- */
- if (!(mask & MAY_EXEC) ||
- (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
- if (capable(CAP_DAC_OVERRIDE))
- return 0;
-
- /*
- * Searching includes executable on directories, else just read.
- */
- if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
- if (capable(CAP_DAC_READ_SEARCH))
- return 0;
-
- return -EACCES;
+int jfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ return generic_permission(inode, mask, jfs_check_acl);
}
int jfs_init_acl(struct inode *inode, struct inode *dir)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 7c91ccfe382f..ecf1bca73050 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -403,6 +403,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->flag = flag;
+#ifdef CONFIG_JFS_POSIX_ACL
+ sb->s_flags |= MS_POSIXACL;
+#endif
+
if (newLVSize) {
printk(KERN_ERR "resize option for remount only\n");
return -EINVAL;
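Setting MS_POSIXACL here matters because generic_permission() gates its
check_acl callback on IS_POSIXACL(inode) (see the fs/namei.c hunk below);
without the flag, jfs_check_acl() would never be consulted. IS_POSIXACL()
is just a superblock flag test, roughly:

	#define IS_POSIXACL(inode)	__IS_FLG(inode, MS_POSIXACL)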
diff --git a/fs/mbcache.c b/fs/mbcache.c
index dbc4443e6949..988161cb0a77 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -65,9 +65,7 @@ EXPORT_SYMBOL(mb_cache_destroy);
EXPORT_SYMBOL(mb_cache_entry_alloc);
EXPORT_SYMBOL(mb_cache_entry_insert);
EXPORT_SYMBOL(mb_cache_entry_release);
-EXPORT_SYMBOL(mb_cache_entry_takeout);
EXPORT_SYMBOL(mb_cache_entry_free);
-EXPORT_SYMBOL(mb_cache_entry_dup);
EXPORT_SYMBOL(mb_cache_entry_get);
#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
EXPORT_SYMBOL(mb_cache_entry_find_first);
@@ -456,23 +454,6 @@ mb_cache_entry_release(struct mb_cache_entry *ce)
/*
- * mb_cache_entry_takeout()
- *
- * Take a cache entry out of the cache, making it invalid. The entry can later
- * be re-inserted using mb_cache_entry_insert(), or released using
- * mb_cache_entry_release().
- */
-void
-mb_cache_entry_takeout(struct mb_cache_entry *ce)
-{
- spin_lock(&mb_cache_spinlock);
- mb_assert(list_empty(&ce->e_lru_list));
- __mb_cache_entry_unhash(ce);
- spin_unlock(&mb_cache_spinlock);
-}
-
-
-/*
* mb_cache_entry_free()
*
* This is equivalent to the sequence mb_cache_entry_takeout() --
@@ -489,20 +470,6 @@ mb_cache_entry_free(struct mb_cache_entry *ce)
/*
- * mb_cache_entry_dup()
- *
- * Duplicate a handle to a cache entry (does not duplicate the cache entry
- * itself). After the call, both the old and the new handle must be released.
- */
-struct mb_cache_entry *
-mb_cache_entry_dup(struct mb_cache_entry *ce)
-{
- atomic_inc(&ce->e_used);
- return ce;
-}
-
-
-/*
* mb_cache_entry_get()
*
* Get a cache entry by device / block number. (There can only be one entry
diff --git a/fs/namei.c b/fs/namei.c
index b00bcaa91ee2..287339bb7b7c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -152,15 +152,19 @@ char * getname(const char __user * filename)
return result;
}
-/*
- * vfs_permission()
+/**
+ * generic_permission - check for access rights on a Posix-like filesystem
+ * @inode: inode to check access rights for
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @check_acl: optional callback to check for Posix ACLs
*
- * is used to check for read/write/execute permissions on a file.
+ * Used to check for read/write/execute permissions on a file.
* We use "fsuid" for this, letting us set arbitrary permissions
* for filesystem access without changing the "normal" uids which
* are used for other things..
*/
-int vfs_permission(struct inode * inode, int mask)
+int generic_permission(struct inode *inode, int mask,
+ int (*check_acl)(struct inode *inode, int mask))
{
umode_t mode = inode->i_mode;
@@ -181,8 +185,18 @@ int vfs_permission(struct inode * inode, int mask)
if (current->fsuid == inode->i_uid)
mode >>= 6;
- else if (in_group_p(inode->i_gid))
- mode >>= 3;
+ else {
+ if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
+ int error = check_acl(inode, mask);
+ if (error == -EACCES)
+ goto check_capabilities;
+ else if (error != -EAGAIN)
+ return error;
+ }
+
+ if (in_group_p(inode->i_gid))
+ mode >>= 3;
+ }
/*
* If the DACs are ok we don't need any capability check.
@@ -190,6 +204,7 @@ int vfs_permission(struct inode * inode, int mask)
if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
return 0;
+ check_capabilities:
/*
* Read/write DACs are always overridable.
* Executable DACs are overridable if at least one exec bit is set.
@@ -220,7 +235,7 @@ int permission(struct inode * inode,int mask, struct nameidata *nd)
if (inode->i_op && inode->i_op->permission)
retval = inode->i_op->permission(inode, submask, nd);
else
- retval = vfs_permission(inode, submask);
+ retval = generic_permission(inode, submask, NULL);
if (retval)
return retval;
@@ -315,7 +330,7 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
/*
* Short-cut version of permission(), for calling by
* path_walk(), when dcache lock is held. Combines parts
- * of permission() and vfs_permission(), and tests ONLY for
+ * of permission() and generic_permission(), and tests ONLY for
* MAY_EXEC permission.
*
* If appropriate, check DAC only. If not appropriate, or
@@ -2438,7 +2453,6 @@ EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
EXPORT_SYMBOL(getname);
EXPORT_SYMBOL(lock_rename);
-EXPORT_SYMBOL(lookup_create);
EXPORT_SYMBOL(lookup_hash);
EXPORT_SYMBOL(lookup_one_len);
EXPORT_SYMBOL(page_follow_link);
@@ -2457,7 +2471,7 @@ EXPORT_SYMBOL(vfs_follow_link);
EXPORT_SYMBOL(vfs_link);
EXPORT_SYMBOL(vfs_mkdir);
EXPORT_SYMBOL(vfs_mknod);
-EXPORT_SYMBOL(vfs_permission);
+EXPORT_SYMBOL(generic_permission);
EXPORT_SYMBOL(vfs_readlink);
EXPORT_SYMBOL(vfs_rename);
EXPORT_SYMBOL(vfs_rmdir);
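For callers of the removed vfs_permission(), the conversion is mechanical;
a filesystem with no Posix ACLs simply passes a NULL callback, as the hfs,
hfsplus, hostfs and nfs hunks above do:

	/* before */
	error = vfs_permission(inode, MAY_READ);
	/* after */
	error = generic_permission(inode, MAY_READ, NULL);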
diff --git a/fs/namespace.c b/fs/namespace.c
index 961b6ae00458..b09b08807f2c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1209,8 +1209,6 @@ void set_fs_root(struct fs_struct *fs, struct vfsmount *mnt,
}
}
-EXPORT_SYMBOL(set_fs_root);
-
/*
* Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
* It can block. Requires the big lock held.
@@ -1234,8 +1232,6 @@ void set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
}
}
-EXPORT_SYMBOL(set_fs_pwd);
-
static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd)
{
struct task_struct *g, *p;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 626252aec95a..eac30582ff9e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1600,7 +1600,7 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
return res;
out_notsup:
nfs_revalidate_inode(NFS_SERVER(inode), inode);
- res = vfs_permission(inode, mask);
+ res = generic_permission(inode, mask, NULL);
unlock_kernel();
return res;
}
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 0302c351217f..b42f7bc9592d 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -21,7 +21,129 @@ ToDo/Notes:
- Enable the code for setting the NT4 compatibility flag when we start
making NTFS 1.2 specific modifications.
-2.1.20 - Fix a stupid bug in ntfs_attr_reinit_search_ctx().
+2.1.21 - Fix some races and bugs, rewrite mft write code, add mft allocator.
+
+ - Implement extent mft record deallocation
+ fs/ntfs/mft.c::ntfs_extent_mft_record_free().
+	- Split runlist related functions off from attrib.[hc] to runlist.[hc].
+ - Add vol->mft_data_pos and initialize it at mount time.
+ - Rename init_runlist() to ntfs_init_runlist(), ntfs_vcn_to_lcn() to
+ ntfs_rl_vcn_to_lcn(), decompress_mapping_pairs() to
+ ntfs_mapping_pairs_decompress(), ntfs_merge_runlists() to
+ ntfs_runlists_merge() and adapt all callers.
+ - Add fs/ntfs/runlist.[hc]::ntfs_get_nr_significant_bytes(),
+ ntfs_get_size_for_mapping_pairs(), ntfs_write_significant_bytes(),
+ and ntfs_mapping_pairs_build(), adapted from libntfs.
+ - Make fs/ntfs/lcnalloc.c::ntfs_cluster_free_from_rl_nolock() not
+ static and add a declaration for it to lcnalloc.h.
+ - Add fs/ntfs/lcnalloc.h::ntfs_cluster_free_from_rl() which is a static
+ inline wrapper for ntfs_cluster_free_from_rl_nolock() which takes the
+ cluster bitmap lock for the duration of the call.
+ - Add fs/ntfs/attrib.[hc]::ntfs_attr_record_resize().
+ - Implement the equivalent of memset() for an ntfs attribute in
+ fs/ntfs/attrib.[hc]::ntfs_attr_set() and switch
+ fs/ntfs/logfile.c::ntfs_empty_logfile() to using it.
+ - Remove unnecessary casts from LCN_* constants.
+ - Implement fs/ntfs/runlist.c::ntfs_rl_truncate_nolock().
+ - Add MFT_RECORD_OLD as a copy of MFT_RECORD in fs/ntfs/layout.h and
+ change MFT_RECORD to contain the NTFS 3.1+ specific fields.
+ - Add a helper function fs/ntfs/aops.c::mark_ntfs_record_dirty() which
+ marks all buffers belonging to an ntfs record dirty, followed by
+ marking the page the ntfs record is in dirty and also marking the vfs
+ inode containing the ntfs record dirty (I_DIRTY_PAGES).
+ - Switch fs/ntfs/index.h::ntfs_index_entry_mark_dirty() to using the
+ new helper fs/ntfs/aops.c::mark_ntfs_record_dirty() and remove the no
+ longer needed fs/ntfs/index.[hc]::__ntfs_index_entry_mark_dirty().
+ - Move ntfs_{un,}map_page() from ntfs.h to aops.h and fix resulting
+ include errors.
+ - Move the typedefs for runlist_element and runlist from types.h to
+ runlist.h and fix resulting include errors.
+ - Remove unused {__,}format_mft_record() from fs/ntfs/mft.c.
+ - Modify fs/ntfs/mft.c::__mark_mft_record_dirty() to use the helper
+ mark_ntfs_record_dirty() which also changes the behaviour in that we
+ now set the buffers belonging to the mft record dirty as well as the
+ page itself.
+ - Update fs/ntfs/mft.c::write_mft_record_nolock() and sync_mft_mirror()
+ to cope with the fact that there now are dirty buffers in mft pages.
+ - Update fs/ntfs/inode.c::ntfs_write_inode() to also use the helper
+ mark_ntfs_record_dirty() and thus to set the buffers belonging to the
+ mft record dirty as well as the page itself.
+ - Fix compiler warnings on x86-64 in fs/ntfs/dir.c. (Randy Dunlap,
+ slightly modified by me)
+ - Add fs/ntfs/mft.c::try_map_mft_record() which fails with -EALREADY if
+ the mft record is already locked and otherwise behaves the same way
+ as fs/ntfs/mft.c::map_mft_record().
+ - Modify fs/ntfs/mft.c::write_mft_record_nolock() so that it only
+ writes the mft record if the buffers belonging to it are dirty.
+ Otherwise we assume that it was written out by other means already.
+ - Attempting to write outside initialized size is _not_ a bug so remove
+ the bug check from fs/ntfs/aops.c::ntfs_write_mst_block(). It is in
+ fact required to write outside initialized size when preparing to
+ extend the initialized size.
+ - Map the page instead of using page_address() before writing to it in
+ fs/ntfs/aops.c::ntfs_mft_writepage().
+ - Provide exclusion between opening an inode / mapping an mft record
+ and accessing the mft record in fs/ntfs/mft.c::ntfs_mft_writepage()
+ by setting the page not uptodate throughout ntfs_mft_writepage().
+ - Clear the page uptodate flag in fs/ntfs/aops.c::ntfs_write_mst_block()
+	  to ensure no one can see the page whilst the mst fixups are applied.
+ - Add the helper fs/ntfs/mft.c::ntfs_may_write_mft_record() which
+ checks if an mft record may be written out safely obtaining any
+ necessary locks in the process. This is used by
+ fs/ntfs/aops.c::ntfs_write_mst_block().
+ - Modify fs/ntfs/aops.c::ntfs_write_mst_block() to also work for
+ writing mft records and improve its error handling in the process.
+ Now if any of the records in the page fail to be written out, all
+ other records will be written out instead of aborting completely.
+ - Remove ntfs_mft_aops and update all users to use ntfs_mst_aops.
+ - Modify fs/ntfs/inode.c::ntfs_read_locked_inode() to set the
+ ntfs_mst_aops for all inodes which are NInoMstProtected() and
+ ntfs_aops for all other inodes.
+ - Rename fs/ntfs/mft.c::sync_mft_mirror{,_umount}() to
+ ntfs_sync_mft_mirror{,_umount}() and change their parameters so they
+ no longer require an ntfs inode to be present. Update all callers.
+ - Cleanup the error handling in fs/ntfs/mft.c::ntfs_sync_mft_mirror().
+ - Clear the page uptodate flag in fs/ntfs/mft.c::ntfs_sync_mft_mirror()
+	  to ensure no one can see the page whilst the mst fixups are applied.
+ - Remove the no longer needed fs/ntfs/mft.c::ntfs_mft_writepage() and
+ fs/ntfs/mft.c::try_map_mft_record().
+ - Fix callers of fs/ntfs/aops.c::mark_ntfs_record_dirty() to call it
+ with the ntfs inode which contains the page rather than the ntfs
+ inode the mft record of which is in the page.
+ - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by moving the
+ index inode bitmap inode release code from there to
+ fs/ntfs/inode.c::ntfs_clear_big_inode(). (Thanks to Christoph
+ Hellwig for spotting this.)
+ - Fix race condition in fs/ntfs/inode.c::ntfs_put_inode() by taking the
+ inode semaphore around the code that sets ni->itype.index.bmp_ino to
+ NULL and reorganize the code to optimize it a bit. (Thanks to
+ Christoph Hellwig for spotting this.)
+ - Modify fs/ntfs/aops.c::mark_ntfs_record_dirty() to no longer take the
+ ntfs inode as a parameter as this is confusing and misleading and the
+ needed ntfs inode is available via NTFS_I(page->mapping->host).
+ Adapt all callers to this change.
+ - Modify fs/ntfs/mft.c::write_mft_record_nolock() and
+ fs/ntfs/aops.c::ntfs_write_mst_block() to only check the dirty state
+ of the first buffer in a record and to take this as the ntfs record
+ dirty state. We cannot look at the dirty state for subsequent
+ buffers because we might be racing with
+ fs/ntfs/aops.c::mark_ntfs_record_dirty().
+ - Move the static inline ntfs_init_big_inode() from fs/ntfs/inode.c to
+ inode.h and make fs/ntfs/inode.c::__ntfs_init_inode() non-static and
+ add a declaration for it to inode.h. Fix some compilation issues
+ that resulted due to #includes and header file interdependencies.
+ - Simplify setup of i_mode in fs/ntfs/inode.c::ntfs_read_locked_inode().
+ - Add helpers fs/ntfs/layout.h::MK_MREF() and MK_LE_MREF().
+ - Modify fs/ntfs/mft.c::map_extent_mft_record() to only verify the mft
+ record sequence number if it is specified (i.e. not zero).
+ - Add fs/ntfs/mft.[hc]::ntfs_mft_record_alloc() and various helper
+ functions used by it.
+ - Update Documentation/filesystems/ntfs.txt with instructions on how to
+ use the Device-Mapper driver with NTFS ftdisk/LDM raid. This removes
+ the linear raid problem with the Software RAID / MD driver when one
+ or more of the devices has an odd number of sectors.
+
+2.1.20 - Fix two stupid bugs introduced in 2.1.18 release.
- Fix stupid bug in fs/ntfs/attrib.c::ntfs_attr_reinit_search_ctx()
where we did not clear ctx->al_entry but it was still set due to
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 28e8ac163037..99cac1cd4285 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -3,10 +3,10 @@
obj-$(CONFIG_NTFS_FS) += ntfs.o
ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
- index.o inode.o mft.o mst.o namei.o super.o sysctl.o unistr.o \
- upcase.o
+ index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
+ unistr.o upcase.o
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.20\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.21\"
ifeq ($(CONFIG_NTFS_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index edcc9fbf6c09..101929b277a2 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -26,7 +26,15 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/buffer_head.h>
-
+#include <linux/writeback.h>
+
+#include "aops.h"
+#include "attrib.h"
+#include "debug.h"
+#include "inode.h"
+#include "mft.h"
+#include "runlist.h"
+#include "types.h"
#include "ntfs.h"
/**
@@ -232,9 +240,9 @@ lock_retry_remap:
/* Seek to element containing target vcn. */
while (rl->length && rl[1].vcn <= vcn)
rl++;
- lcn = ntfs_vcn_to_lcn(rl, vcn);
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
} else
- lcn = (LCN)LCN_RL_NOT_MAPPED;
+ lcn = LCN_RL_NOT_MAPPED;
/* Successful remap. */
if (lcn >= 0) {
/* Setup buffer head to correct block. */
@@ -266,7 +274,7 @@ lock_retry_remap:
}
/* Hard error, zero out region. */
SetPageError(page);
- ntfs_error(vol->sb, "ntfs_vcn_to_lcn(vcn = 0x%llx) "
+ ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn(vcn = 0x%llx) "
"failed with error code 0x%llx%s.",
(unsigned long long)vcn,
(unsigned long long)-lcn,
@@ -274,9 +282,9 @@ lock_retry_remap:
// FIXME: Depending on vol->on_errors, do something.
}
/*
- * Either iblock was outside lblock limits or ntfs_vcn_to_lcn()
- * returned error. Just zero that portion of the page and set
- * the buffer uptodate.
+ * Either iblock was outside lblock limits or
+ * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
+ * of the page and set the buffer uptodate.
*/
handle_hole:
bh->b_blocknr = -1UL;
@@ -637,9 +645,9 @@ lock_retry_remap:
/* Seek to element containing target vcn. */
while (rl->length && rl[1].vcn <= vcn)
rl++;
- lcn = ntfs_vcn_to_lcn(rl, vcn);
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
} else
- lcn = (LCN)LCN_RL_NOT_MAPPED;
+ lcn = LCN_RL_NOT_MAPPED;
/* Successful remap. */
if (lcn >= 0) {
/* Setup buffer head to point to correct block. */
@@ -673,7 +681,7 @@ lock_retry_remap:
}
/* Failed to map the buffer, even after retrying. */
bh->b_blocknr = -1UL;
- ntfs_error(vol->sb, "ntfs_vcn_to_lcn(vcn = 0x%llx) failed "
+ ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn(vcn = 0x%llx) failed "
"with error code 0x%llx%s.",
(unsigned long long)vcn,
(unsigned long long)-lcn,
@@ -772,25 +780,25 @@ lock_retry_remap:
return err;
}
-static const char *ntfs_please_email = "Please email "
- "linux-ntfs-dev@lists.sourceforge.net and say that you saw "
- "this message. Thank you.";
-
/**
* ntfs_write_mst_block - write a @page to the backing store
* @wbc: writeback control structure
* @page: page cache page to write out
*
* This function is for writing pages belonging to non-resident, mst protected
- * attributes to their backing store. The only supported attribute is the
- * index allocation attribute. Both directory inodes and index inodes are
- * supported.
+ * attributes to their backing store. The only supported attributes are index
+ * allocation and $MFT/$DATA. Both directory inodes and index inodes are
+ * supported for the index allocation case.
*
* The page must remain locked for the duration of the write because we apply
* the mst fixups, write, and then undo the fixups, so if we were to unlock the
* page before undoing the fixups, any other user of the page will see the
* page contents as corrupt.
*
+ * We clear the page uptodate flag for the duration of the function to
+ * ensure that, in the $MFT/$DATA case, no one can map an mft record while
+ * we are applying the mst fixups to it.
+ *
* Return 0 on success and -errno on error.
*
* Based on ntfs_write_block(), ntfs_mft_writepage(), and
@@ -805,60 +813,53 @@ static int ntfs_write_mst_block(struct writeback_control *wbc,
ntfs_volume *vol = ni->vol;
u8 *kaddr;
unsigned int bh_size = 1 << vi->i_blkbits;
- unsigned int rec_size;
- struct buffer_head *bh, *head;
+ unsigned int rec_size = ni->itype.index.block_size;
+ ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
+ struct buffer_head *bh, *head, *tbh;
int max_bhs = PAGE_CACHE_SIZE / bh_size;
struct buffer_head *bhs[max_bhs];
- int i, nr_recs, nr_bhs, bhs_per_rec, err;
- unsigned char bh_size_bits;
- BOOL rec_is_dirty;
+ int i, nr_locked_nis, nr_recs, nr_bhs, bhs_per_rec, err;
+ unsigned char bh_size_bits, rec_size_bits;
+ BOOL sync, is_mft, page_is_dirty, rec_is_dirty;
ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
"0x%lx.", vi->i_ino, ni->type, page->index);
BUG_ON(!NInoNonResident(ni));
BUG_ON(!NInoMstProtected(ni));
- BUG_ON(!(S_ISDIR(vi->i_mode) ||
+ is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
+ BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
- BUG_ON(PageWriteback(page));
- BUG_ON(!PageUptodate(page));
BUG_ON(!max_bhs);
+ /* Were we called for sync purposes? */
+ sync = (wbc->sync_mode == WB_SYNC_ALL);
+
/* Make sure we have mapped buffers. */
- if (unlikely(!page_has_buffers(page))) {
-no_buffers_err_out:
- ntfs_error(vol->sb, "Writing ntfs records without existing "
- "buffers is not implemented yet. %s",
- ntfs_please_email);
- err = -EOPNOTSUPP;
- goto err_out;
- }
+ BUG_ON(!page_has_buffers(page));
bh = head = page_buffers(page);
- if (unlikely(!bh))
- goto no_buffers_err_out;
+ BUG_ON(!bh);
bh_size_bits = vi->i_blkbits;
- rec_size = ni->itype.index.block_size;
- nr_recs = PAGE_CACHE_SIZE / rec_size;
- BUG_ON(!nr_recs);
+ rec_size_bits = ni->itype.index.block_size_bits;
+ BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
bhs_per_rec = rec_size >> bh_size_bits;
BUG_ON(!bhs_per_rec);
/* The first block in the page. */
- rec_block = block = (s64)page->index <<
+ rec_block = block = (sector_t)page->index <<
(PAGE_CACHE_SHIFT - bh_size_bits);
/* The first out of bounds block for the data size. */
dblock = (vi->i_size + bh_size - 1) >> bh_size_bits;
- err = nr_bhs = 0;
- /* Need this to silence a stupid gcc warning. */
- rec_is_dirty = FALSE;
+ err = nr_bhs = nr_recs = nr_locked_nis = 0;
+ page_is_dirty = rec_is_dirty = FALSE;
do {
if (unlikely(block >= dblock)) {
/*
* Mapped buffers outside i_size will occur, because
* this page can be outside i_size when there is a
- * truncate in progress. The contents of such buffers
+ * truncate in progress. The contents of such buffers
* were zeroed by ntfs_writepage().
*
* FIXME: What about the small race window where
@@ -869,137 +870,235 @@ no_buffers_err_out:
clear_buffer_dirty(bh);
continue;
}
- if (rec_block == block) {
+ if (likely(block < rec_block)) {
+ /*
+ * This block is not the first one in the record. We
+ * ignore the buffer's dirty state because we could
+ * have raced with a parallel mark_ntfs_record_dirty().
+ */
+ if (!rec_is_dirty)
+ continue;
+ } else /* if (block == rec_block) */ {
+ BUG_ON(block > rec_block);
/* This block is the first one in the record. */
- rec_block += rec_size >> bh_size_bits;
+ rec_block += bhs_per_rec;
if (!buffer_dirty(bh)) {
- /* Clean buffers are not written out. */
+ /* Clean records are not written out. */
rec_is_dirty = FALSE;
continue;
}
rec_is_dirty = TRUE;
- } else {
- /* This block is not the first one in the record. */
- if (!buffer_dirty(bh)) {
- /* Clean buffers are not written out. */
- BUG_ON(rec_is_dirty);
- continue;
- }
- BUG_ON(!rec_is_dirty);
- }
- /* Attempting to write outside the initialized size is a bug. */
- BUG_ON(((block + 1) << bh_size_bits) > ni->initialized_size);
- if (!buffer_mapped(bh)) {
- ntfs_error(vol->sb, "Writing ntfs records without "
- "existing mapped buffers is not "
- "implemented yet. %s",
- ntfs_please_email);
- clear_buffer_dirty(bh);
- err = -EOPNOTSUPP;
- goto cleanup_out;
- }
- if (!buffer_uptodate(bh)) {
- ntfs_error(vol->sb, "Writing ntfs records without "
- "existing uptodate buffers is not "
- "implemented yet. %s",
- ntfs_please_email);
- clear_buffer_dirty(bh);
- err = -EOPNOTSUPP;
- goto cleanup_out;
}
+ BUG_ON(!buffer_mapped(bh));
+ BUG_ON(!buffer_uptodate(bh));
bhs[nr_bhs++] = bh;
BUG_ON(nr_bhs > max_bhs);
} while (block++, (bh = bh->b_this_page) != head);
/* If there were no dirty buffers, we are done. */
if (!nr_bhs)
goto done;
- /* Apply the mst protection fixups. */
- kaddr = page_address(page);
+ /* Map the page so we can access its contents. */
+ kaddr = kmap(page);
+ /* Clear the page uptodate flag whilst the mst fixups are applied. */
+ BUG_ON(!PageUptodate(page));
+ ClearPageUptodate(page);
for (i = 0; i < nr_bhs; i++) {
- if (!(i % bhs_per_rec)) {
- err = pre_write_mst_fixup((NTFS_RECORD*)(kaddr +
- bh_offset(bhs[i])), rec_size);
- if (err) {
- ntfs_error(vol->sb, "Failed to apply mst "
- "fixups (inode 0x%lx, "
- "attribute type 0x%x, page "
- "index 0x%lx)! Umount and "
- "run chkdsk.", vi->i_ino,
- ni->type,
- page->index);
- nr_bhs = i;
- goto mst_cleanup_out;
+ unsigned int ofs;
+
+ /* Skip buffers which are not at the beginning of records. */
+ if (i % bhs_per_rec)
+ continue;
+ tbh = bhs[i];
+ ofs = bh_offset(tbh);
+ if (is_mft) {
+ ntfs_inode *tni;
+ unsigned long mft_no;
+
+ /* Get the mft record number. */
+ mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+ >> rec_size_bits;
+ /* Check whether to write this mft record. */
+ tni = NULL;
+ if (!ntfs_may_write_mft_record(vol, mft_no,
+ (MFT_RECORD*)(kaddr + ofs), &tni)) {
+ /*
+ * The record should not be written. This
+ * means we need to redirty the page before
+ * returning.
+ */
+ page_is_dirty = TRUE;
+ /*
+ * Remove the buffers in this mft record from
+ * the list of buffers to write.
+ */
+ do {
+ bhs[i] = NULL;
+ } while (++i % bhs_per_rec);
+ continue;
}
+ /*
+ * The record should be written. If a locked ntfs
+ * inode was returned, add it to the array of locked
+ * ntfs inodes.
+ */
+ if (tni)
+ locked_nis[nr_locked_nis++] = tni;
}
+ /* Apply the mst protection fixups. */
+ err = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
+ rec_size);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to apply mst fixups "
+ "(inode 0x%lx, attribute type 0x%x, "
+ "page index 0x%lx, page offset 0x%x)!"
+ " Unmount and run chkdsk.", vi->i_ino,
+ ni->type, page->index, ofs);
+ /*
+ * Mark all the buffers in this record clean as we do
+ * not want to write corrupt data to disk.
+ */
+ do {
+ clear_buffer_dirty(bhs[i]);
+ bhs[i] = NULL;
+ } while (++i % bhs_per_rec);
+ continue;
+ }
+ nr_recs++;
}
+ /* If no records are to be written out, we are done. */
+ if (!nr_recs)
+ goto unm_done;
flush_dcache_page(page);
/* Lock buffers and start synchronous write i/o on them. */
for (i = 0; i < nr_bhs; i++) {
- struct buffer_head *tbh = bhs[i];
-
+ tbh = bhs[i];
+ if (!tbh)
+ continue;
if (unlikely(test_set_buffer_locked(tbh)))
BUG();
- if (unlikely(!test_clear_buffer_dirty(tbh))) {
- unlock_buffer(tbh);
- continue;
- }
+ /* The buffer dirty state is now irrelevant, just clean it. */
+ clear_buffer_dirty(tbh);
BUG_ON(!buffer_uptodate(tbh));
BUG_ON(!buffer_mapped(tbh));
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, tbh);
}
+ /* Synchronize the mft mirror now if not @sync. */
+ if (is_mft && !sync)
+ goto do_mirror;
+do_wait:
/* Wait on i/o completion of buffers. */
for (i = 0; i < nr_bhs; i++) {
- struct buffer_head *tbh = bhs[i];
-
+ tbh = bhs[i];
+ if (!tbh)
+ continue;
wait_on_buffer(tbh);
if (unlikely(!buffer_uptodate(tbh))) {
+ ntfs_error(vol->sb, "I/O error while writing ntfs "
+ "record buffer (inode 0x%lx, "
+ "attribute type 0x%x, page index "
+ "0x%lx, page offset 0x%lx)! Unmount "
+ "and run chkdsk.", vi->i_ino, ni->type,
+ page->index, bh_offset(tbh));
err = -EIO;
/*
- * Set the buffer uptodate so the page & buffer states
- * don't become out of sync.
+ * Set the buffer uptodate so the page and buffer
+ * states do not become out of sync.
*/
- if (PageUptodate(page))
- set_buffer_uptodate(tbh);
+ set_buffer_uptodate(tbh);
}
}
+ /* If @sync, now synchronize the mft mirror. */
+ if (is_mft && sync) {
+do_mirror:
+ for (i = 0; i < nr_bhs; i++) {
+ unsigned long mft_no;
+ unsigned int ofs;
+
+ /*
+ * Skip buffers which are not at the beginning of
+ * records.
+ */
+ if (i % bhs_per_rec)
+ continue;
+ tbh = bhs[i];
+ /* Skip removed buffers (and hence records). */
+ if (!tbh)
+ continue;
+ ofs = bh_offset(tbh);
+ /* Get the mft record number. */
+ mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+ >> rec_size_bits;
+ if (mft_no < vol->mftmirr_size)
+ ntfs_sync_mft_mirror(vol, mft_no,
+ (MFT_RECORD*)(kaddr + ofs),
+ sync);
+ }
+ if (!sync)
+ goto do_wait;
+ }
/* Remove the mst protection fixups again. */
for (i = 0; i < nr_bhs; i++) {
- if (!(i % bhs_per_rec))
+ if (!(i % bhs_per_rec)) {
+ tbh = bhs[i];
+ if (!tbh)
+ continue;
post_write_mst_fixup((NTFS_RECORD*)(kaddr +
- bh_offset(bhs[i])));
+ bh_offset(tbh)));
+ }
}
flush_dcache_page(page);
+unm_done:
+ /* Unlock any locked inodes. */
+ while (nr_locked_nis-- > 0) {
+ ntfs_inode *tni, *base_tni;
+
+ tni = locked_nis[nr_locked_nis];
+ /* Get the base inode. */
+ down(&tni->extent_lock);
+ if (tni->nr_extents >= 0)
+ base_tni = tni;
+ else {
+ base_tni = tni->ext.base_ntfs_ino;
+ BUG_ON(!base_tni);
+ }
+ up(&tni->extent_lock);
+ ntfs_debug("Unlocking %s inode 0x%lx.",
+ tni == base_tni ? "base" : "extent",
+ tni->mft_no);
+ up(&tni->mrec_lock);
+ atomic_dec(&tni->count);
+ iput(VFS_I(base_tni));
+ }
if (unlikely(err)) {
- /* I/O error during writing. This is really bad! */
- ntfs_error(vol->sb, "I/O error while writing ntfs record "
- "(inode 0x%lx, attribute type 0x%x, page "
- "index 0x%lx)! Umount and run chkdsk.",
- vi->i_ino, ni->type, page->index);
- goto err_out;
+ SetPageError(page);
+ NVolSetErrors(vol);
}
+ SetPageUptodate(page);
+ kunmap(page);
done:
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
- if (!err)
+ if (page_is_dirty) {
+ ntfs_debug("Page still contains one or more dirty ntfs "
+ "records. Redirtying the page starting at "
+ "record 0x%lx.", page->index <<
+ (PAGE_CACHE_SHIFT - rec_size_bits));
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ } else {
+ /*
+ * Keep the VM happy. This must be done otherwise the
+ * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
+ * the page is clean.
+ */
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ }
+ if (likely(!err))
ntfs_debug("Done.");
return err;
-mst_cleanup_out:
- /* Remove the mst protection fixups again. */
- for (i = 0; i < nr_bhs; i++) {
- if (!(i % bhs_per_rec))
- post_write_mst_fixup((NTFS_RECORD*)(kaddr +
- bh_offset(bhs[i])));
- }
-cleanup_out:
- /* Clean the buffers. */
- for (i = 0; i < nr_bhs; i++)
- clear_buffer_dirty(bhs[i]);
-err_out:
- SetPageError(page);
- goto done;
}
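
The do_wait/do_mirror gotos above encode a simple ordering rule for the mft
mirror. As a rough sketch (standalone userspace C with stub helpers standing
in for the real submit/wait/mirror loops; not part of this patch), the flow
is equivalent to:

	#include <stdio.h>

	static void submit_all(void)          { printf("submit all buffers\n"); }
	static void wait_all(void)            { printf("wait on all buffers\n"); }
	static void sync_mirror_records(void) { printf("sync mft mirror\n"); }

	static void write_mst_block_order(int is_mft, int sync)
	{
		submit_all();
		if (is_mft && !sync)
			sync_mirror_records();	/* async: mirror before waiting */
		wait_all();
		if (is_mft && sync)
			sync_mirror_records();	/* sync: mirror after waiting */
	}
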
/**
@@ -1007,6 +1106,9 @@ err_out:
* @page: page cache page to write out
* @wbc: writeback control structure
*
+ * This is called from the VM when it wants to have a dirty ntfs page cache
+ * page cleaned. The VM has already locked the page and marked it clean.
+ *
* For non-resident attributes, ntfs_writepage() writes the @page by calling
* the ntfs version of the generic block_write_full_page() function,
* ntfs_write_block(), which in turn if necessary creates and writes the
@@ -1017,8 +1119,6 @@ err_out:
* The mft record is then marked dirty and written out asynchronously via the
* vfs inode dirty code path.
*
- * Note the caller clears the page dirty flag before calling ntfs_writepage().
- *
* Based on ntfs_readpage() and fs/buffer.c::block_write_full_page().
*
* Return 0 on success and -errno on error.
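
For readers unfamiliar with the address space contract: the VM hands
->writepage a locked page whose dirty bit it has already cleared, and the
method must start writeback and unlock the page. A minimal sketch of that
obligation (synchronous, error handling elided; not NTFS-specific):

	#include <linux/mm.h>
	#include <linux/pagemap.h>
	#include <linux/writeback.h>

	static int writepage_contract_sketch(struct page *page,
			struct writeback_control *wbc)
	{
		/* The page arrives locked with PG_dirty already cleared. */
		set_page_writeback(page);
		unlock_page(page);
		/* ... submit and complete the actual I/O here ... */
		end_page_writeback(page);	/* usually done from bio completion */
		return 0;
	}
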
@@ -1402,9 +1502,9 @@ lock_retry_remap:
/* Seek to element containing target vcn. */
while (rl->length && rl[1].vcn <= vcn)
rl++;
- lcn = ntfs_vcn_to_lcn(rl, vcn);
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
} else
- lcn = (LCN)LCN_RL_NOT_MAPPED;
+ lcn = LCN_RL_NOT_MAPPED;
if (unlikely(lcn < 0)) {
/*
* We extended the attribute allocation above.
@@ -1451,7 +1551,7 @@ lock_retry_remap:
* retrying.
*/
bh->b_blocknr = -1UL;
- ntfs_error(vol->sb, "ntfs_vcn_to_lcn(vcn = "
+ ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn(vcn = "
"0x%llx) failed with error "
"code 0x%llx%s.",
(unsigned long long)vcn,
@@ -2028,3 +2128,46 @@ struct address_space_operations ntfs_mst_aops = {
belonging to the page. */
#endif /* NTFS_RW */
};
+
+#ifdef NTFS_RW
+
+/**
+ * mark_ntfs_record_dirty - mark an ntfs record dirty
+ * @page: page containing the ntfs record to mark dirty
+ * @ofs: byte offset within @page at which the ntfs record begins
+ *
+ * If the ntfs record is the same size as the page cache page @page, set all
+ * buffers in the page dirty. Otherwise, set only the buffers in which the
+ * ntfs record is located dirty.
+ *
+ * Also, set the page containing the ntfs record dirty, which also marks the
+ * vfs inode the ntfs record belongs to dirty (I_DIRTY_PAGES).
+ */
+void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs)
+{
+ ntfs_inode *ni;
+ struct buffer_head *bh, *head;
+ unsigned int end, bh_size, bh_ofs;
+
+ BUG_ON(!page);
+ BUG_ON(!page_has_buffers(page));
+ ni = NTFS_I(page->mapping->host);
+ BUG_ON(!ni);
+ if (ni->itype.index.block_size == PAGE_CACHE_SIZE) {
+ __set_page_dirty_buffers(page);
+ return;
+ }
+ end = ofs + ni->itype.index.block_size;
+ bh_size = ni->vol->sb->s_blocksize;
+ bh = head = page_buffers(page);
+ do {
+ bh_ofs = bh_offset(bh);
+ if (bh_ofs + bh_size <= ofs)
+ continue;
+ if (unlikely(bh_ofs >= end))
+ break;
+ set_buffer_dirty(bh);
+ } while ((bh = bh->b_this_page) != head);
+ __set_page_dirty_nobuffers(page);
+}
+
+#endif /* NTFS_RW */
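
A hypothetical caller of the new helper, assuming a kmapped page that
already has buffers attached (mark_ntfs_record_dirty() BUG()s otherwise),
might look like this sketch:

	#include <linux/highmem.h>
	#include "aops.h"

	/* Modify the ntfs record at byte offset @ofs of @page, then
	 * propagate dirtiness to the buffers, the page and the owning
	 * VFS inode in one call. */
	static void dirty_record_sketch(struct page *page, unsigned int ofs)
	{
		u8 *kaddr = kmap(page);

		kaddr[ofs] ^= 0;	/* placeholder in-place modification */
		flush_dcache_page(page);
		kunmap(page);
		mark_ntfs_record_dirty(page, ofs);
	}
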
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
new file mode 100644
index 000000000000..10b23174cb5f
--- /dev/null
+++ b/fs/ntfs/aops.h
@@ -0,0 +1,102 @@
+/**
+ * aops.h - Defines for NTFS kernel address space operations and page cache
+ * handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LINUX_NTFS_AOPS_H
+#define _LINUX_NTFS_AOPS_H
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/fs.h>
+
+#include "inode.h"
+
+/**
+ * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
+ * @page: the page to release
+ *
+ * Unpin, unmap and release a page that was obtained from ntfs_map_page().
+ */
+static inline void ntfs_unmap_page(struct page *page)
+{
+ kunmap(page);
+ page_cache_release(page);
+}
+
+/**
+ * ntfs_map_page - map a page into accessible memory, reading it if necessary
+ * @mapping: address space for which to obtain the page
+ * @index: index into the page cache for @mapping of the page to map
+ *
+ * Read a page from the page cache of the address space @mapping at position
+ * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes.
+ *
+ * If the page is not in memory it is loaded from disk first using the readpage
+ * method defined in the address space operations of @mapping and the page is
+ * added to the page cache of @mapping in the process.
+ *
+ * If the page is in high memory it is mapped into memory directly addressable
+ * by the kernel.
+ *
+ * Finally the page count is incremented, thus pinning the page into place.
+ *
+ * The above means that page_address(page) can be used on all pages obtained
+ * with ntfs_map_page() to get the kernel virtual address of the page.
+ *
+ * When finished with the page, the caller has to call ntfs_unmap_page() to
+ * unpin, unmap and release the page.
+ *
+ * Note this does not grant exclusive access. If such is desired, the caller
+ * must provide it independently of the ntfs_{un}map_page() calls by using
+ * a {rw_}semaphore or other means of serialization. A spin lock cannot be
+ * used as ntfs_map_page() can block.
+ *
+ * The unlocked and uptodate page is returned on success or an encoded error
+ * on failure. Caller has to test for error using the IS_ERR() macro on the
+ * return value. If that evaluates to TRUE, the negative error code can be
+ * obtained using PTR_ERR() on the return value of ntfs_map_page().
+ */
+static inline struct page *ntfs_map_page(struct address_space *mapping,
+ unsigned long index)
+{
+ struct page *page = read_cache_page(mapping, index,
+ (filler_t*)mapping->a_ops->readpage, NULL);
+
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ kmap(page);
+ if (PageUptodate(page) && !PageError(page))
+ return page;
+ ntfs_unmap_page(page);
+ return ERR_PTR(-EIO);
+ }
+ return page;
+}
+
+#ifdef NTFS_RW
+
+extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_AOPS_H */
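
A typical map/use/unmap cycle for the helpers above, assuming a valid,
populated address space (sketch only):

	#include <linux/err.h>
	#include "aops.h"

	static int read_first_byte_sketch(struct address_space *mapping, u8 *val)
	{
		struct page *page = ntfs_map_page(mapping, 0);

		if (IS_ERR(page))
			return PTR_ERR(page);
		/* Safe even for highmem pages: the page is kmap()ed. */
		*val = *(u8*)page_address(page);
		ntfs_unmap_page(page);
		return 0;
	}
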
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index fdf20b85ceb4..865de0c2c3d0 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,5 +1,5 @@
/**
- * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
+ * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
*
* Copyright (c) 2001-2004 Anton Altaparmakov
* Copyright (c) 2002 Richard Russon
@@ -21,915 +21,11 @@
*/
#include <linux/buffer_head.h>
-#include "ntfs.h"
-#include "dir.h"
-
-/* Temporary helper functions -- might become macros */
-
-/**
- * ntfs_rl_mm - runlist memmove
- *
- * It is up to the caller to serialize access to the runlist @base.
- */
-static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
- int size)
-{
- if (likely((dst != src) && (size > 0)))
- memmove(base + dst, base + src, size * sizeof (*base));
-}
-
-/**
- * ntfs_rl_mc - runlist memory copy
- *
- * It is up to the caller to serialize access to the runlists @dstbase and
- * @srcbase.
- */
-static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
- runlist_element *srcbase, int src, int size)
-{
- if (likely(size > 0))
- memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase));
-}
-
-/**
- * ntfs_rl_realloc - Reallocate memory for runlists
- * @rl: original runlist
- * @old_size: number of runlist elements in the original runlist @rl
- * @new_size: number of runlist elements we need space for
- *
- * As the runlists grow, more memory will be required. To prevent the
- * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns and entire page of memory.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * N.B. If the new allocation doesn't require a different number of pages in
- * memory, the function will return the original pointer.
- *
- * On success, return a pointer to the newly allocated, or recycled, memory.
- * On error, return -errno. The following error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
- int old_size, int new_size)
-{
- runlist_element *new_rl;
-
- old_size = PAGE_ALIGN(old_size * sizeof(*rl));
- new_size = PAGE_ALIGN(new_size * sizeof(*rl));
- if (old_size == new_size)
- return rl;
-
- new_rl = ntfs_malloc_nofs(new_size);
- if (unlikely(!new_rl))
- return ERR_PTR(-ENOMEM);
-
- if (likely(rl != NULL)) {
- if (unlikely(old_size > new_size))
- old_size = new_size;
- memcpy(new_rl, rl, old_size);
- ntfs_free(rl);
- }
- return new_rl;
-}
-
-/**
- * ntfs_are_rl_mergeable - test if two runlists can be joined together
- * @dst: original runlist
- * @src: new runlist to test for mergeability with @dst
- *
- * Test if two runlists can be joined together. For this, their VCNs and LCNs
- * must be adjacent.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * Return: TRUE Success, the runlists can be merged.
- * FALSE Failure, the runlists cannot be merged.
- */
-static inline BOOL ntfs_are_rl_mergeable(runlist_element *dst,
- runlist_element *src)
-{
- BUG_ON(!dst);
- BUG_ON(!src);
-
- if ((dst->lcn < 0) || (src->lcn < 0)) /* Are we merging holes? */
- return FALSE;
- if ((dst->lcn + dst->length) != src->lcn) /* Are the runs contiguous? */
- return FALSE;
- if ((dst->vcn + dst->length) != src->vcn) /* Are the runs misaligned? */
- return FALSE;
-
- return TRUE;
-}
-
-/**
- * __ntfs_rl_merge - merge two runlists without testing if they can be merged
- * @dst: original, destination runlist
- * @src: new runlist to merge with @dst
- *
- * Merge the two runlists, writing into the destination runlist @dst. The
- * caller must make sure the runlists can be merged or this will corrupt the
- * destination runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- */
-static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
-{
- dst->length += src->length;
-}
-
-/**
- * ntfs_rl_merge - test if two runlists can be joined together and merge them
- * @dst: original, destination runlist
- * @src: new runlist to merge with @dst
- *
- * Test if two runlists can be joined together. For this, their VCNs and LCNs
- * must be adjacent. If they can be merged, perform the merge, writing into
- * the destination runlist @dst.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * Return: TRUE Success, the runlists have been merged.
- * FALSE Failure, the runlists cannot be merged and have not been
- * modified.
- */
-static inline BOOL ntfs_rl_merge(runlist_element *dst, runlist_element *src)
-{
- BOOL merge = ntfs_are_rl_mergeable(dst, src);
-
- if (merge)
- __ntfs_rl_merge(dst, src);
- return merge;
-}
-
-/**
- * ntfs_rl_append - append a runlist after a given element
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: runlist to be inserted into @dst
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: append the new runlist @src after this element in @dst
- *
- * Append the runlist @src after element @loc in @dst. Merge the right end of
- * the new runlist, if necessary. Adjust the size of the hole before the
- * appended runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_append(runlist_element *dst,
- int dsize, runlist_element *src, int ssize, int loc)
-{
- BOOL right;
- int magic;
-
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* First, check if the right hand end needs merging. */
- right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
-
- /* Space required: @dst size + @src size, less one if we merged. */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlists.
- */
-
- /* First, merge the right hand end, if necessary. */
- if (right)
- __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
-
- magic = loc + ssize;
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, magic + 1, loc + 1 + right, dsize - loc - 1 - right);
- ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
- /* Adjust the size of the preceding hole. */
- dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
-
- /* We may have changed the length of the file, so fix the end marker */
- if (dst[magic + 1].lcn == LCN_ENOENT)
- dst[magic + 1].vcn = dst[magic].vcn + dst[magic].length;
-
- return dst;
-}
-
-/**
- * ntfs_rl_insert - insert a runlist into another
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: insert the new runlist @src before this element in @dst
- *
- * Insert the runlist @src before element @loc in the runlist @dst. Merge the
- * left end of the new runlist, if necessary. Adjust the size of the hole
- * after the inserted runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
- int dsize, runlist_element *src, int ssize, int loc)
-{
- BOOL left = FALSE;
- BOOL disc = FALSE; /* Discontinuity */
- BOOL hole = FALSE; /* Following a hole */
- int magic;
-
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* disc => Discontinuity between the end of @dst and the start of @src.
- * This means we might need to insert a hole.
- * hole => @dst ends with a hole or an unmapped region which we can
- * extend to match the discontinuity. */
- if (loc == 0)
- disc = (src[0].vcn > 0);
- else {
- s64 merged_length;
-
- left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-
- merged_length = dst[loc - 1].length;
- if (left)
- merged_length += src->length;
-
- disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
- if (disc)
- hole = (dst[loc - 1].lcn == LCN_HOLE);
- }
-
- /* Space required: @dst size + @src size, less one if we merged, plus
- * one if there was a discontinuity, less one for a trailing hole. */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc - hole);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlist.
- */
-
- if (left)
- __ntfs_rl_merge(dst + loc - 1, src);
-
- magic = loc + ssize - left + disc - hole;
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, magic, loc, dsize - loc);
- ntfs_rl_mc(dst, loc + disc - hole, src, left, ssize - left);
-
- /* Adjust the VCN of the last run ... */
- if (dst[magic].lcn <= LCN_HOLE)
- dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
- /* ... and the length. */
- if (dst[magic].lcn == LCN_HOLE || dst[magic].lcn == LCN_RL_NOT_MAPPED)
- dst[magic].length = dst[magic + 1].vcn - dst[magic].vcn;
-
- /* Writing beyond the end of the file and there's a discontinuity. */
- if (disc) {
- if (hole)
- dst[loc - 1].length = dst[loc].vcn - dst[loc - 1].vcn;
- else {
- if (loc > 0) {
- dst[loc].vcn = dst[loc - 1].vcn +
- dst[loc - 1].length;
- dst[loc].length = dst[loc + 1].vcn -
- dst[loc].vcn;
- } else {
- dst[loc].vcn = 0;
- dst[loc].length = dst[loc + 1].vcn;
- }
- dst[loc].lcn = LCN_RL_NOT_MAPPED;
- }
-
- magic += hole;
-
- if (dst[magic].lcn == LCN_ENOENT)
- dst[magic].vcn = dst[magic - 1].vcn +
- dst[magic - 1].length;
- }
- return dst;
-}
-
-/**
- * ntfs_rl_replace - overwrite a runlist element with another runlist
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: index in runlist @dst to overwrite with @src
- *
- * Replace the runlist element @dst at @loc with @src. Merge the left and
- * right ends of the inserted runlist, if necessary.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
- int dsize, runlist_element *src, int ssize, int loc)
-{
- BOOL left = FALSE;
- BOOL right;
- int magic;
-
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* First, merge the left and right ends, if necessary. */
- right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
- if (loc > 0)
- left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-
- /* Allocate some space. We'll need less if the left, right, or both
- * ends were merged. */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left - right);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlists.
- */
- if (right)
- __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
- if (left)
- __ntfs_rl_merge(dst + loc - 1, src);
-
- /* FIXME: What does this mean? (AIA) */
- magic = loc + ssize - left;
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, magic, loc + right + 1, dsize - loc - right - 1);
- ntfs_rl_mc(dst, loc, src, left, ssize - left);
-
- /* We may have changed the length of the file, so fix the end marker */
- if (dst[magic].lcn == LCN_ENOENT)
- dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
- return dst;
-}
-
-/**
- * ntfs_rl_split - insert a runlist into the centre of a hole
- * @dst: original runlist to be worked on
- * @dsize: number of elements in @dst (including end marker)
- * @src: new runlist to be inserted
- * @ssize: number of elements in @src (excluding end marker)
- * @loc: index in runlist @dst at which to split and insert @src
- *
- * Split the runlist @dst at @loc into two and insert @new in between the two
- * fragments. No merging of runlists is necessary. Adjust the size of the
- * holes either side.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
- runlist_element *src, int ssize, int loc)
-{
- BUG_ON(!dst);
- BUG_ON(!src);
-
- /* Space required: @dst size + @src size + one new hole. */
- dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
- if (IS_ERR(dst))
- return dst;
- /*
- * We are guaranteed to succeed from here so can start modifying the
- * original runlists.
- */
-
- /* Move the tail of @dst out of the way, then copy in @src. */
- ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
- ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
- /* Adjust the size of the holes either size of @src. */
- dst[loc].length = dst[loc+1].vcn - dst[loc].vcn;
- dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length;
- dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
-
- return dst;
-}
-
-/**
- * ntfs_merge_runlists - merge two runlists into one
- * @drl: original runlist to be worked on
- * @srl: new runlist to be merged into @drl
- *
- * First we sanity check the two runlists @srl and @drl to make sure that they
- * are sensible and can be merged. The runlist @srl must be either after the
- * runlist @drl or completely within a hole (or unmapped region) in @drl.
- *
- * It is up to the caller to serialize access to the runlists @drl and @srl.
- *
- * Merging of runlists is necessary in two cases:
- * 1. When attribute lists are used and a further extent is being mapped.
- * 2. When new clusters are allocated to fill a hole or extend a file.
- *
- * There are four possible ways @srl can be merged. It can:
- * - be inserted at the beginning of a hole,
- * - split the hole in two and be inserted between the two fragments,
- * - be appended at the end of a hole, or it can
- * - replace the whole hole.
- * It can also be appended to the end of the runlist, which is just a variant
- * of the insert case.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @drl and @srl are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EINVAL - Invalid parameters were passed in.
- * -ERANGE - The runlists overlap and cannot be merged.
- */
-runlist_element *ntfs_merge_runlists(runlist_element *drl,
- runlist_element *srl)
-{
- int di, si; /* Current index into @[ds]rl. */
- int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */
- int dins; /* Index into @drl at which to insert @srl. */
- int dend, send; /* Last index into @[ds]rl. */
- int dfinal, sfinal; /* The last index into @[ds]rl with
- lcn >= LCN_HOLE. */
- int marker = 0;
- VCN marker_vcn = 0;
-
-#ifdef DEBUG
- ntfs_debug("dst:");
- ntfs_debug_dump_runlist(drl);
- ntfs_debug("src:");
- ntfs_debug_dump_runlist(srl);
-#endif
-
- /* Check for silly calling... */
- if (unlikely(!srl))
- return drl;
- if (IS_ERR(srl) || IS_ERR(drl))
- return ERR_PTR(-EINVAL);
- /* Check for the case where the first mapping is being done now. */
- if (unlikely(!drl)) {
- drl = srl;
- /* Complete the source runlist if necessary. */
- if (unlikely(drl[0].vcn)) {
- /* Scan to the end of the source runlist. */
- for (dend = 0; likely(drl[dend].length); dend++)
- ;
- drl = ntfs_rl_realloc(drl, dend, dend + 1);
- if (IS_ERR(drl))
- return drl;
- /* Insert start element at the front of the runlist. */
- ntfs_rl_mm(drl, 1, 0, dend);
- drl[0].vcn = 0;
- drl[0].lcn = LCN_RL_NOT_MAPPED;
- drl[0].length = drl[1].vcn;
- }
- goto finished;
- }
-
- si = di = 0;
-
- /* Skip any unmapped start element(s) in the source runlist. */
- while (srl[si].length && srl[si].lcn < (LCN)LCN_HOLE)
- si++;
-
- /* Can't have an entirely unmapped source runlist. */
- BUG_ON(!srl[si].length);
-
- /* Record the starting points. */
- sstart = si;
-
- /*
- * Skip forward in @drl until we reach the position where @srl needs to
- * be inserted. If we reach the end of @drl, @srl just needs to be
- * appended to @drl.
- */
- for (; drl[di].length; di++) {
- if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
- break;
- }
- dins = di;
-
- /* Sanity check for illegal overlaps. */
- if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
- (srl[si].lcn >= 0)) {
- ntfs_error(NULL, "Run lists overlap. Cannot merge!");
- return ERR_PTR(-ERANGE);
- }
-
- /* Scan to the end of both runlists in order to know their sizes. */
- for (send = si; srl[send].length; send++)
- ;
- for (dend = di; drl[dend].length; dend++)
- ;
-
- if (srl[send].lcn == (LCN)LCN_ENOENT)
- marker_vcn = srl[marker = send].vcn;
-
- /* Scan to the last element with lcn >= LCN_HOLE. */
- for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
- ;
- for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
- ;
-
- {
- BOOL start;
- BOOL finish;
- int ds = dend + 1; /* Number of elements in drl & srl */
- int ss = sfinal - sstart + 1;
-
- start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */
- (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */
- finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */
- ((drl[dins].vcn + drl[dins].length) <= /* End of hole */
- (srl[send - 1].vcn + srl[send - 1].length)));
-
- /* Or we'll lose an end marker */
- if (start && finish && (drl[dins].length == 0))
- ss++;
- if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
- finish = FALSE;
-#if 0
- ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
- ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
- ntfs_debug("start = %i, finish = %i", start, finish);
- ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
-#endif
- if (start) {
- if (finish)
- drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
- else
- drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
- } else {
- if (finish)
- drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
- else
- drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
- }
- if (IS_ERR(drl)) {
- ntfs_error(NULL, "Merge failed.");
- return drl;
- }
- ntfs_free(srl);
- if (marker) {
- ntfs_debug("Triggering marker code.");
- for (ds = dend; drl[ds].length; ds++)
- ;
- /* We only need to care if @srl ended after @drl. */
- if (drl[ds].vcn <= marker_vcn) {
- int slots = 0;
-
- if (drl[ds].vcn == marker_vcn) {
- ntfs_debug("Old marker = 0x%llx, replacing "
- "with LCN_ENOENT.",
- (unsigned long long)
- drl[ds].lcn);
- drl[ds].lcn = (LCN)LCN_ENOENT;
- goto finished;
- }
- /*
- * We need to create an unmapped runlist element in
- * @drl or extend an existing one before adding the
- * ENOENT terminator.
- */
- if (drl[ds].lcn == (LCN)LCN_ENOENT) {
- ds--;
- slots = 1;
- }
- if (drl[ds].lcn != (LCN)LCN_RL_NOT_MAPPED) {
- /* Add an unmapped runlist element. */
- if (!slots) {
- /* FIXME/TODO: We need to have the
- * extra memory already! (AIA) */
- drl = ntfs_rl_realloc(drl, ds, ds + 2);
- if (!drl)
- goto critical_error;
- slots = 2;
- }
- ds++;
- /* Need to set vcn if it isn't set already. */
- if (slots != 1)
- drl[ds].vcn = drl[ds - 1].vcn +
- drl[ds - 1].length;
- drl[ds].lcn = (LCN)LCN_RL_NOT_MAPPED;
- /* We now used up a slot. */
- slots--;
- }
- drl[ds].length = marker_vcn - drl[ds].vcn;
- /* Finally add the ENOENT terminator. */
- ds++;
- if (!slots) {
- /* FIXME/TODO: We need to have the extra
- * memory already! (AIA) */
- drl = ntfs_rl_realloc(drl, ds, ds + 1);
- if (!drl)
- goto critical_error;
- }
- drl[ds].vcn = marker_vcn;
- drl[ds].lcn = (LCN)LCN_ENOENT;
- drl[ds].length = (s64)0;
- }
- }
- }
-
-finished:
- /* The merge was completed successfully. */
- ntfs_debug("Merged runlist:");
- ntfs_debug_dump_runlist(drl);
- return drl;
-
-critical_error:
- /* Critical error! We cannot afford to fail here. */
- ntfs_error(NULL, "Critical error! Not enough memory.");
- panic("NTFS: Cannot continue.");
-}
-
-/**
- * decompress_mapping_pairs - convert mapping pairs array to runlist
- * @vol: ntfs volume on which the attribute resides
- * @attr: attribute record whose mapping pairs array to decompress
- * @old_rl: optional runlist in which to insert @attr's runlist
- *
- * It is up to the caller to serialize access to the runlist @old_rl.
- *
- * Decompress the attribute @attr's mapping pairs array into a runlist. On
- * success, return the decompressed runlist.
- *
- * If @old_rl is not NULL, decompressed runlist is inserted into the
- * appropriate place in @old_rl and the resultant, combined runlist is
- * returned. The original @old_rl is deallocated.
- *
- * On error, return -errno. @old_rl is left unmodified in that case.
- *
- * The following error codes are defined:
- * -ENOMEM - Not enough memory to allocate runlist array.
- * -EIO - Corrupt runlist.
- * -EINVAL - Invalid parameters were passed in.
- * -ERANGE - The two runlists overlap.
- *
- * FIXME: For now we take the conceptionally simplest approach of creating the
- * new runlist disregarding the already existing one and then splicing the
- * two into one, if that is possible (we check for overlap and discard the new
- * runlist if overlap present before returning ERR_PTR(-ERANGE)).
- */
-runlist_element *decompress_mapping_pairs(const ntfs_volume *vol,
- const ATTR_RECORD *attr, runlist_element *old_rl)
-{
- VCN vcn; /* Current vcn. */
- LCN lcn; /* Current lcn. */
- s64 deltaxcn; /* Change in [vl]cn. */
- runlist_element *rl; /* The output runlist. */
- u8 *buf; /* Current position in mapping pairs array. */
- u8 *attr_end; /* End of attribute. */
- int rlsize; /* Size of runlist buffer. */
- u16 rlpos; /* Current runlist position in units of
- runlist_elements. */
- u8 b; /* Current byte offset in buf. */
-
-#ifdef DEBUG
- /* Make sure attr exists and is non-resident. */
- if (!attr || !attr->non_resident || sle64_to_cpu(
- attr->data.non_resident.lowest_vcn) < (VCN)0) {
- ntfs_error(vol->sb, "Invalid arguments.");
- return ERR_PTR(-EINVAL);
- }
-#endif
- /* Start at vcn = lowest_vcn and lcn 0. */
- vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
- lcn = 0;
- /* Get start of the mapping pairs array. */
- buf = (u8*)attr + le16_to_cpu(
- attr->data.non_resident.mapping_pairs_offset);
- attr_end = (u8*)attr + le32_to_cpu(attr->length);
- if (unlikely(buf < (u8*)attr || buf > attr_end)) {
- ntfs_error(vol->sb, "Corrupt attribute.");
- return ERR_PTR(-EIO);
- }
- /* Current position in runlist array. */
- rlpos = 0;
- /* Allocate first page and set current runlist size to one page. */
- rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
- if (unlikely(!rl))
- return ERR_PTR(-ENOMEM);
- /* Insert unmapped starting element if necessary. */
- if (vcn) {
- rl->vcn = (VCN)0;
- rl->lcn = (LCN)LCN_RL_NOT_MAPPED;
- rl->length = vcn;
- rlpos++;
- }
- while (buf < attr_end && *buf) {
- /*
- * Allocate more memory if needed, including space for the
- * not-mapped and terminator elements. ntfs_malloc_nofs()
- * operates on whole pages only.
- */
- if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
- runlist_element *rl2;
-
- rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
- if (unlikely(!rl2)) {
- ntfs_free(rl);
- return ERR_PTR(-ENOMEM);
- }
- memcpy(rl2, rl, rlsize);
- ntfs_free(rl);
- rl = rl2;
- rlsize += PAGE_SIZE;
- }
- /* Enter the current vcn into the current runlist element. */
- rl[rlpos].vcn = vcn;
- /*
- * Get the change in vcn, i.e. the run length in clusters.
- * Doing it this way ensures that we signextend negative values.
- * A negative run length doesn't make any sense, but hey, I
- * didn't make up the NTFS specs and Windows NT4 treats the run
- * length as a signed value so that's how it is...
- */
- b = *buf & 0xf;
- if (b) {
- if (unlikely(buf + b > attr_end))
- goto io_error;
- for (deltaxcn = (s8)buf[b--]; b; b--)
- deltaxcn = (deltaxcn << 8) + buf[b];
- } else { /* The length entry is compulsory. */
- ntfs_error(vol->sb, "Missing length entry in mapping "
- "pairs array.");
- deltaxcn = (s64)-1;
- }
- /*
- * Assume a negative length to indicate data corruption and
- * hence clean-up and return NULL.
- */
- if (unlikely(deltaxcn < 0)) {
- ntfs_error(vol->sb, "Invalid length in mapping pairs "
- "array.");
- goto err_out;
- }
- /*
- * Enter the current run length into the current runlist
- * element.
- */
- rl[rlpos].length = deltaxcn;
- /* Increment the current vcn by the current run length. */
- vcn += deltaxcn;
- /*
- * There might be no lcn change at all, as is the case for
- * sparse clusters on NTFS 3.0+, in which case we set the lcn
- * to LCN_HOLE.
- */
- if (!(*buf & 0xf0))
- rl[rlpos].lcn = (LCN)LCN_HOLE;
- else {
- /* Get the lcn change which really can be negative. */
- u8 b2 = *buf & 0xf;
- b = b2 + ((*buf >> 4) & 0xf);
- if (buf + b > attr_end)
- goto io_error;
- for (deltaxcn = (s8)buf[b--]; b > b2; b--)
- deltaxcn = (deltaxcn << 8) + buf[b];
- /* Change the current lcn to its new value. */
- lcn += deltaxcn;
-#ifdef DEBUG
- /*
- * On NTFS 1.2-, apparently can have lcn == -1 to
- * indicate a hole. But we haven't verified ourselves
- * whether it is really the lcn or the deltaxcn that is
- * -1. So if either is found give us a message so we
- * can investigate it further!
- */
- if (vol->major_ver < 3) {
- if (unlikely(deltaxcn == (LCN)-1))
- ntfs_error(vol->sb, "lcn delta == -1");
- if (unlikely(lcn == (LCN)-1))
- ntfs_error(vol->sb, "lcn == -1");
- }
-#endif
- /* Check lcn is not below -1. */
- if (unlikely(lcn < (LCN)-1)) {
- ntfs_error(vol->sb, "Invalid LCN < -1 in "
- "mapping pairs array.");
- goto err_out;
- }
- /* Enter the current lcn into the runlist element. */
- rl[rlpos].lcn = lcn;
- }
- /* Get to the next runlist element. */
- rlpos++;
- /* Increment the buffer position to the next mapping pair. */
- buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
- }
- if (unlikely(buf >= attr_end))
- goto io_error;
- /*
- * If there is a highest_vcn specified, it must be equal to the final
- * vcn in the runlist - 1, or something has gone badly wrong.
- */
- deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
- if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
-mpa_err:
- ntfs_error(vol->sb, "Corrupt mapping pairs array in "
- "non-resident attribute.");
- goto err_out;
- }
- /* Setup not mapped runlist element if this is the base extent. */
- if (!attr->data.non_resident.lowest_vcn) {
- VCN max_cluster;
-
- max_cluster = (sle64_to_cpu(
- attr->data.non_resident.allocated_size) +
- vol->cluster_size - 1) >>
- vol->cluster_size_bits;
- /*
- * If there is a difference between the highest_vcn and the
- * highest cluster, the runlist is either corrupt or, more
- * likely, there are more extents following this one.
- */
- if (deltaxcn < --max_cluster) {
- ntfs_debug("More extents to follow; deltaxcn = 0x%llx, "
- "max_cluster = 0x%llx",
- (unsigned long long)deltaxcn,
- (unsigned long long)max_cluster);
- rl[rlpos].vcn = vcn;
- vcn += rl[rlpos].length = max_cluster - deltaxcn;
- rl[rlpos].lcn = (LCN)LCN_RL_NOT_MAPPED;
- rlpos++;
- } else if (unlikely(deltaxcn > max_cluster)) {
- ntfs_error(vol->sb, "Corrupt attribute. deltaxcn = "
- "0x%llx, max_cluster = 0x%llx",
- (unsigned long long)deltaxcn,
- (unsigned long long)max_cluster);
- goto mpa_err;
- }
- rl[rlpos].lcn = (LCN)LCN_ENOENT;
- } else /* Not the base extent. There may be more extents to follow. */
- rl[rlpos].lcn = (LCN)LCN_RL_NOT_MAPPED;
-
- /* Setup terminating runlist element. */
- rl[rlpos].vcn = vcn;
- rl[rlpos].length = (s64)0;
- /* If no existing runlist was specified, we are done. */
- if (!old_rl) {
- ntfs_debug("Mapping pairs array successfully decompressed:");
- ntfs_debug_dump_runlist(rl);
- return rl;
- }
- /* Now combine the new and old runlists checking for overlaps. */
- old_rl = ntfs_merge_runlists(old_rl, rl);
- if (likely(!IS_ERR(old_rl)))
- return old_rl;
- ntfs_free(rl);
- ntfs_error(vol->sb, "Failed to merge runlists.");
- return old_rl;
-io_error:
- ntfs_error(vol->sb, "Corrupt attribute.");
-err_out:
- ntfs_free(rl);
- return ERR_PTR(-EIO);
-}
+#include "attrib.h"
+#include "debug.h"
+#include "mft.h"
+#include "ntfs.h"
/**
* ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
@@ -973,10 +69,11 @@ int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
down_write(&ni->runlist.lock);
/* Make sure someone else didn't do the work while we were sleeping. */
- if (likely(ntfs_vcn_to_lcn(ni->runlist.rl, vcn) <= LCN_RL_NOT_MAPPED)) {
+ if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
+ LCN_RL_NOT_MAPPED)) {
runlist_element *rl;
- rl = decompress_mapping_pairs(ni->vol, ctx->attr,
+ rl = ntfs_mapping_pairs_decompress(ni->vol, ctx->attr,
ni->runlist.rl);
if (IS_ERR(rl))
err = PTR_ERR(rl);
@@ -993,63 +90,6 @@ err_out:
}
/**
- * ntfs_vcn_to_lcn - convert a vcn into a lcn given a runlist
- * @rl: runlist to use for conversion
- * @vcn: vcn to convert
- *
- * Convert the virtual cluster number @vcn of an attribute into a logical
- * cluster number (lcn) of a device using the runlist @rl to map vcns to their
- * corresponding lcns.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * Since lcns must be >= 0, we use negative return values with special meaning:
- *
- * Return value Meaning / Description
- * ==================================================
- * -1 = LCN_HOLE Hole / not allocated on disk.
- * -2 = LCN_RL_NOT_MAPPED This is part of the runlist which has not been
- * inserted into the runlist yet.
- * -3 = LCN_ENOENT There is no such vcn in the attribute.
- *
- * Locking: - The caller must have locked the runlist (for reading or writing).
- * - This function does not touch the lock.
- */
-LCN ntfs_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
-{
- int i;
-
- BUG_ON(vcn < 0);
- /*
- * If rl is NULL, assume that we have found an unmapped runlist. The
- * caller can then attempt to map it and fail appropriately if
- * necessary.
- */
- if (unlikely(!rl))
- return (LCN)LCN_RL_NOT_MAPPED;
-
- /* Catch out of lower bounds vcn. */
- if (unlikely(vcn < rl[0].vcn))
- return (LCN)LCN_ENOENT;
-
- for (i = 0; likely(rl[i].length); i++) {
- if (unlikely(vcn < rl[i+1].vcn)) {
- if (likely(rl[i].lcn >= (LCN)0))
- return rl[i].lcn + (vcn - rl[i].vcn);
- return rl[i].lcn;
- }
- }
- /*
- * The terminator element is setup to the correct value, i.e. one of
- * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT.
- */
- if (likely(rl[i].lcn < (LCN)0))
- return rl[i].lcn;
- /* Just in case... We could replace this with BUG() some day. */
- return (LCN)LCN_ENOENT;
-}
-
-/**
* ntfs_find_vcn - find a vcn in the runlist described by an ntfs inode
* @ni: ntfs inode describing the runlist to search
* @vcn: vcn to find
@@ -1104,7 +144,7 @@ lock_retry_remap:
if (likely(rl && vcn >= rl[0].vcn)) {
while (likely(rl->length)) {
if (likely(vcn < rl[1].vcn)) {
- if (likely(rl->lcn >= (LCN)LCN_HOLE)) {
+ if (likely(rl->lcn >= LCN_HOLE)) {
ntfs_debug("Done.");
return rl;
}
@@ -1112,8 +152,8 @@ lock_retry_remap:
}
rl++;
}
- if (likely(rl->lcn != (LCN)LCN_RL_NOT_MAPPED)) {
- if (likely(rl->lcn == (LCN)LCN_ENOENT))
+ if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) {
+ if (likely(rl->lcn == LCN_ENOENT))
err = -ENOENT;
else
err = -EIO;
@@ -1362,14 +402,14 @@ int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
rl = runlist->rl;
/* Read all clusters specified by the runlist one run at a time. */
while (rl->length) {
- lcn = ntfs_vcn_to_lcn(rl, rl->vcn);
+ lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
(unsigned long long)rl->vcn,
(unsigned long long)lcn);
/* The attribute list cannot be sparse. */
if (lcn < 0) {
- ntfs_error(sb, "ntfs_vcn_to_lcn() failed. Cannot read "
- "attribute list.");
+ ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot "
+ "read attribute list.");
goto err_out;
}
block = lcn << vol->cluster_size_bits >> block_size_bits;
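
As a worked example of that last shift (hypothetical geometry): with
4096-byte clusters (cluster_size_bits = 12) on a device with 512-byte blocks
(block_size_bits = 9), lcn 0x10 starts at byte offset 0x10000, i.e.:

	block = 0x10 << 12 >> 9 = 0x80
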
@@ -1908,3 +948,188 @@ void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
kmem_cache_free(ntfs_attr_ctx_cache, ctx);
return;
}
+
+/**
+ * ntfs_attr_record_resize - resize an attribute record
+ * @m: mft record containing attribute record
+ * @a: attribute record to resize
+ * @new_size: new size in bytes to which to resize the attribute record @a
+ *
+ * Resize the attribute record @a, i.e. the resident part of the attribute, in
+ * the mft record @m to @new_size bytes.
+ *
+ * Return 0 on success and -errno on error. The following error codes are
+ * defined:
+ * -ENOSPC - Not enough space in the mft record @m to perform the resize.
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: If you make a record smaller without having copied all the data you
+ * are interested in, the data may be overwritten.
+ */
+int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
+{
+ ntfs_debug("Entering for new_size %u.", new_size);
+ /* Align to 8 bytes if it is not already done. */
+ if (new_size & 7)
+ new_size = (new_size + 7) & ~7;
+ /* If the actual attribute length has changed, move things around. */
+ if (new_size != le32_to_cpu(a->length)) {
+ u32 new_muse = le32_to_cpu(m->bytes_in_use) -
+ le32_to_cpu(a->length) + new_size;
+ /* Not enough space in this mft record. */
+ if (new_muse > le32_to_cpu(m->bytes_allocated))
+ return -ENOSPC;
+ /* Move attributes following @a to their new location. */
+ memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length),
+ le32_to_cpu(m->bytes_in_use) - ((u8*)a -
+ (u8*)m) - le32_to_cpu(a->length));
+ /* Adjust @m to reflect the change in used space. */
+ m->bytes_in_use = cpu_to_le32(new_muse);
+ /* Adjust @a to reflect the new size. */
+ if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
+ a->length = cpu_to_le32(new_size);
+ }
+ return 0;
+}
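
To illustrate the bookkeeping (hypothetical sizes, not from this patch):
growing an attribute only succeeds while bytes_in_use less the old length
plus @new_size stays within bytes_allocated; otherwise -ENOSPC is returned
and nothing is touched. A caller sketch:

	#include "attrib.h"

	/* Try to grow resident attribute @a in mapped mft record @m. */
	static int grow_attr_sketch(MFT_RECORD *m, ATTR_RECORD *a)
	{
		int err = ntfs_attr_record_resize(m, a, 0x80);

		if (err == -ENOSPC) {
			/* Record full: a real caller would make the
			 * attribute non-resident or move it to another
			 * mft record before retrying. */
		}
		return err;
	}
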
+
+/**
+ * ntfs_attr_set - fill (a part of) an attribute with a byte
+ * @ni: ntfs inode describing the attribute to fill
+ * @ofs: offset inside the attribute at which to start to fill
+ * @cnt: number of bytes to fill
+ * @val: the unsigned 8-bit value with which to fill the attribute
+ *
+ * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
+ * byte offset @ofs inside the attribute with the constant byte @val.
+ *
+ * This function is effectively like memset() applied to an ntfs attribute.
+ *
+ * Return 0 on success and -errno on error. An error code of -ESPIPE means
+ * that @ofs + @cnt were outside the end of the attribute and no write was
+ * performed.
+ */
+int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
+{
+ ntfs_volume *vol = ni->vol;
+ struct address_space *mapping;
+ struct page *page;
+ u8 *kaddr;
+ pgoff_t idx, end;
+ unsigned int start_ofs, end_ofs, size;
+
+ ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
+ (long long)ofs, (long long)cnt, val);
+ BUG_ON(ofs < 0);
+ BUG_ON(cnt < 0);
+ if (!cnt)
+ goto done;
+ mapping = VFS_I(ni)->i_mapping;
+ /* Work out the starting index and page offset. */
+ idx = ofs >> PAGE_CACHE_SHIFT;
+ start_ofs = ofs & ~PAGE_CACHE_MASK;
+ /* Work out the ending index and page offset. */
+ end = ofs + cnt;
+ end_ofs = end & ~PAGE_CACHE_MASK;
+ /* If the end is outside the inode size return -ESPIPE. */
+ if (unlikely(end > VFS_I(ni)->i_size)) {
+ ntfs_error(vol->sb, "Request exceeds end of attribute.");
+ return -ESPIPE;
+ }
+ end >>= PAGE_CACHE_SHIFT;
+ /* If there is a first partial page, need to do it the slow way. */
+ if (start_ofs) {
+ page = read_cache_page(mapping, idx,
+ (filler_t*)mapping->a_ops->readpage, NULL);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to read first partial "
+ "page (sync error, index 0x%lx).", idx);
+ return PTR_ERR(page);
+ }
+ wait_on_page_locked(page);
+ if (unlikely(!PageUptodate(page))) {
+ ntfs_error(vol->sb, "Failed to read first partial page "
+ "(async error, index 0x%lx).", idx);
+ page_cache_release(page);
+			return -EIO;
+ }
+ /*
+ * If the last page is the same as the first page, need to
+ * limit the write to the end offset.
+ */
+ size = PAGE_CACHE_SIZE;
+ if (idx == end)
+ size = end_ofs;
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + start_ofs, val, size - start_ofs);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ set_page_dirty(page);
+ page_cache_release(page);
+ if (idx == end)
+ goto done;
+ idx++;
+ }
+ /* Do the whole pages the fast way. */
+ for (; idx < end; idx++) {
+ /* Find or create the current page. (The page is locked.) */
+ page = grab_cache_page(mapping, idx);
+ if (unlikely(!page)) {
+ ntfs_error(vol->sb, "Insufficient memory to grab "
+ "page (index 0x%lx).", idx);
+ return -ENOMEM;
+ }
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr, val, PAGE_CACHE_SIZE);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ /*
+ * If the page has buffers, mark them uptodate since buffer
+ * state and not page state is definitive in 2.6 kernels.
+ */
+ if (page_has_buffers(page)) {
+ struct buffer_head *bh, *head;
+
+ bh = head = page_buffers(page);
+ do {
+ set_buffer_uptodate(bh);
+ } while ((bh = bh->b_this_page) != head);
+ }
+ /* Now that buffers are uptodate, set the page uptodate, too. */
+ SetPageUptodate(page);
+ /*
+ * Set the page and all its buffers dirty and mark the inode
+ * dirty, too. The VM will write the page later on.
+ */
+ set_page_dirty(page);
+ /* Finally unlock and release the page. */
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ /* If there is a last partial page, need to do it the slow way. */
+ if (end_ofs) {
+ page = read_cache_page(mapping, idx,
+ (filler_t*)mapping->a_ops->readpage, NULL);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to read last partial page "
+ "(sync error, index 0x%lx).", idx);
+ return PTR_ERR(page);
+ }
+ wait_on_page_locked(page);
+ if (unlikely(!PageUptodate(page))) {
+ ntfs_error(vol->sb, "Failed to read last partial page "
+ "(async error, index 0x%lx).", idx);
+ page_cache_release(page);
+			return -EIO;
+ }
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr, val, end_ofs);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+ set_page_dirty(page);
+ page_cache_release(page);
+ }
+done:
+ ntfs_debug("Done.");
+ return 0;
+}
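
For example, zeroing 9000 bytes starting at offset 100 exercises all three
paths above on a 4096-byte page size system: a partial first page, one whole
page, and a partial last page (sketch; @ni is assumed to describe a
sufficiently large attribute):

	#include "attrib.h"

	static int zero_region_sketch(ntfs_inode *ni)
	{
		/* Like memset(attr + 100, 0, 9000) on the attribute data. */
		return ntfs_attr_set(ni, 100, 9000, 0);
	}
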
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 92899f4ff571..3d98ce8b7aa0 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -24,23 +24,11 @@
#ifndef _LINUX_NTFS_ATTRIB_H
#define _LINUX_NTFS_ATTRIB_H
-#include <linux/fs.h>
-
#include "endian.h"
#include "types.h"
#include "layout.h"
-
-static inline void init_runlist(runlist *rl)
-{
- rl->rl = NULL;
- init_rwsem(&rl->lock);
-}
-
-typedef enum {
- LCN_HOLE = -1, /* Keep this as highest value or die! */
- LCN_RL_NOT_MAPPED = -2,
- LCN_ENOENT = -3,
-} LCN_SPECIAL_VALUES;
+#include "inode.h"
+#include "runlist.h"
/**
* ntfs_attr_search_ctx - used in attribute search functions
@@ -71,13 +59,8 @@ typedef struct {
ATTR_RECORD *base_attr;
} ntfs_attr_search_ctx;
-extern runlist_element *decompress_mapping_pairs(const ntfs_volume *vol,
- const ATTR_RECORD *attr, runlist_element *old_rl);
-
extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
-extern LCN ntfs_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
-
extern runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn,
const BOOL need_write);
@@ -101,4 +84,9 @@ extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
MFT_RECORD *mrec);
extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
+extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
+
+extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
+ const u8 val);
+
#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
index b8f06111f6ef..12cf2e30c7dd 100644
--- a/fs/ntfs/bitmap.c
+++ b/fs/ntfs/bitmap.c
@@ -25,6 +25,7 @@
#include "bitmap.h"
#include "debug.h"
+#include "aops.h"
#include "ntfs.h"
/**
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
index 31dd894a4319..4a28ab3898ef 100644
--- a/fs/ntfs/collate.c
+++ b/fs/ntfs/collate.c
@@ -19,8 +19,9 @@
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include "ntfs.h"
#include "collate.h"
+#include "debug.h"
+#include "ntfs.h"
static int ntfs_collate_binary(ntfs_volume *vol,
const void *data1, const int data1_len,
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index cc3aac147ba3..5d173da9b645 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -25,6 +25,9 @@
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
+#include "attrib.h"
+#include "inode.h"
+#include "debug.h"
#include "ntfs.h"
/**
@@ -600,9 +603,9 @@ lock_retry_remap:
/* Seek to element containing target vcn. */
while (rl->length && rl[1].vcn <= vcn)
rl++;
- lcn = ntfs_vcn_to_lcn(rl, vcn);
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
} else
- lcn = (LCN)LCN_RL_NOT_MAPPED;
+ lcn = LCN_RL_NOT_MAPPED;
ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
(unsigned long long)vcn,
(unsigned long long)lcn);
@@ -926,7 +929,7 @@ map_rl_err:
rl_err:
up_read(&ni->runlist.lock);
- ntfs_error(vol->sb, "ntfs_vcn_to_lcn() failed. Cannot read "
+ ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read "
"compression block.");
goto err_out;
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 63c62602224e..8ac37c33d127 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -22,13 +22,9 @@
#ifndef _LINUX_NTFS_DEBUG_H
#define _LINUX_NTFS_DEBUG_H
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/spinlock.h>
#include <linux/fs.h>
-#include "inode.h"
-#include "attrib.h"
+#include "runlist.h"
#ifdef DEBUG
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index ed9838e0c5d0..9b8594d4311b 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -21,8 +21,14 @@
*/
#include <linux/smp_lock.h>
-#include "ntfs.h"
+#include <linux/buffer_head.h>
+
#include "dir.h"
+#include "aops.h"
+#include "attrib.h"
+#include "mft.h"
+#include "debug.h"
+#include "ntfs.h"
/**
* The little endian Unicode string $I30 as a global constant.
@@ -1272,7 +1278,7 @@ get_next_bmp_page:
ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
(unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT),
(unsigned long long)bmp_pos &
- ((PAGE_CACHE_SIZE * 8) - 1));
+ (unsigned long long)((PAGE_CACHE_SIZE * 8) - 1));
bmp_page = ntfs_map_page(bmp_mapping,
bmp_pos >> (3 + PAGE_CACHE_SHIFT));
if (IS_ERR(bmp_page)) {
@@ -1386,8 +1392,8 @@ find_next_index_buffer:
*/
for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
ntfs_debug("In index allocation, offset 0x%llx.",
- (unsigned long long)ia_start + ((u8*)ie -
- (u8*)ia));
+ (unsigned long long)ia_start +
+ (unsigned long long)((u8*)ie - (u8*)ia));
/* Bounds checks. */
if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
sizeof(INDEX_ENTRY_HEADER) > index_end ||
diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h
index 90a8c3f65203..aea7582d561f 100644
--- a/fs/ntfs/dir.h
+++ b/fs/ntfs/dir.h
@@ -24,6 +24,8 @@
#define _LINUX_NTFS_DIR_H
#include "layout.h"
+#include "inode.h"
+#include "types.h"
/*
* ntfs_name is used to return the file name to the caller of
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index c7880d55b250..2ec83edb6893 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,6 +19,11 @@
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+
+#include "inode.h"
+#include "debug.h"
#include "ntfs.h"
/**
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index b19a4882b689..aded65d13a65 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -19,9 +19,11 @@
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include "ntfs.h"
+#include "aops.h"
#include "collate.h"
+#include "debug.h"
#include "index.h"
+#include "ntfs.h"
/**
* ntfs_index_ctx_get - allocate and initialize a new index context
@@ -459,58 +461,3 @@ idx_err_out:
err = -EIO;
goto err_out;
}
-
-#ifdef NTFS_RW
-
-/**
- * __ntfs_index_entry_mark_dirty - mark an index allocation entry dirty
- * @ictx: ntfs index context describing the index entry
- *
- * NOTE: You want to use fs/ntfs/index.h::ntfs_index_entry_mark_dirty() instead!
- *
- * Mark the index allocation entry described by the index entry context @ictx
- * dirty.
- *
- * The index entry must be in an index block belonging to the index allocation
- * attribute. Mark the buffers belonging to the index record as well as the
- * page cache page the index block is in dirty. This automatically marks the
- * VFS inode of the ntfs index inode to which the index entry belongs dirty,
- * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the
- * dirty index block, will be written out to disk later.
- */
-void __ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
-{
- ntfs_inode *ni;
- struct page *page;
- struct buffer_head *bh, *head;
- unsigned int rec_start, rec_end, bh_size, bh_start, bh_end;
-
- BUG_ON(ictx->is_in_root);
- ni = ictx->idx_ni;
- page = ictx->page;
- BUG_ON(!page_has_buffers(page));
- /*
- * If the index block is the same size as the page cache page, set all
- * the buffers in the page, as well as the page itself, dirty.
- */
- if (ni->itype.index.block_size == PAGE_CACHE_SIZE) {
- __set_page_dirty_buffers(page);
- return;
- }
- /* Set only the buffers in which the index block is located dirty. */
- rec_start = (unsigned int)((u8*)ictx->ia - (u8*)page_address(page));
- rec_end = rec_start + ni->itype.index.block_size;
- bh_size = ni->vol->sb->s_blocksize;
- bh_start = 0;
- bh = head = page_buffers(page);
- do {
- bh_end = bh_start + bh_size;
- if ((bh_start >= rec_start) && (bh_end <= rec_end))
- set_buffer_dirty(bh);
- bh_start = bh_end;
- } while ((bh = bh->b_this_page) != head);
- /* Finally, set the page itself dirty, too. */
- __set_page_dirty_nobuffers(page);
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
index 159442dfa90a..846a489e8692 100644
--- a/fs/ntfs/index.h
+++ b/fs/ntfs/index.h
@@ -30,6 +30,7 @@
#include "inode.h"
#include "attrib.h"
#include "mft.h"
+#include "aops.h"
/**
* @idx_ni: index inode containing the @entry described by this context
@@ -115,8 +116,6 @@ static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx)
flush_dcache_page(ictx->page);
}
-extern void __ntfs_index_entry_mark_dirty(ntfs_index_context *ictx);
-
/**
* ntfs_index_entry_mark_dirty - mark an index entry dirty
* @ictx: ntfs index context describing the index entry
@@ -140,7 +139,8 @@ static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
if (ictx->is_in_root)
mark_mft_record_dirty(ictx->actx->ntfs_ino);
else
- __ntfs_index_entry_mark_dirty(ictx);
+ mark_ntfs_record_dirty(ictx->page,
+ (u8*)ictx->ia - (u8*)page_address(ictx->page));
}
#endif /* NTFS_RW */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 873c25d9830f..181f9b8df900 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -25,11 +25,15 @@
#include <linux/quotaops.h>
#include <linux/mount.h>
-#include "ntfs.h"
+#include "aops.h"
#include "dir.h"
+#include "debug.h"
#include "inode.h"
#include "attrib.h"
+#include "malloc.h"
+#include "mft.h"
#include "time.h"
+#include "ntfs.h"
/**
* ntfs_test_inode - compare two (possibly fake) inodes for equality
@@ -369,20 +373,20 @@ void ntfs_destroy_extent_inode(ntfs_inode *ni)
*
* Return zero on success and -ENOMEM on error.
*/
-static void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
+void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
{
ntfs_debug("Entering.");
ni->initialized_size = ni->allocated_size = 0;
ni->seq_no = 0;
atomic_set(&ni->count, 1);
ni->vol = NTFS_SB(sb);
- init_runlist(&ni->runlist);
+ ntfs_init_runlist(&ni->runlist);
init_MUTEX(&ni->mrec_lock);
ni->page = NULL;
ni->page_ofs = 0;
ni->attr_list_size = 0;
ni->attr_list = NULL;
- init_runlist(&ni->attr_list_rl);
+ ntfs_init_runlist(&ni->attr_list_rl);
ni->itype.index.bmp_ino = NULL;
ni->itype.index.block_size = 0;
ni->itype.index.vcn_size = 0;
@@ -392,17 +396,6 @@ static void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
init_MUTEX(&ni->extent_lock);
ni->nr_extents = 0;
ni->ext.base_ntfs_ino = NULL;
- return;
-}
-
-static inline void ntfs_init_big_inode(struct inode *vi)
-{
- ntfs_inode *ni = NTFS_I(vi);
-
- ntfs_debug("Entering.");
- __ntfs_init_inode(vi->i_sb, ni);
- ni->mft_no = vi->i_ino;
- return;
}
inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
@@ -601,14 +594,26 @@ static int ntfs_read_locked_inode(struct inode *vi)
* Also if not a directory, it could be something else, rather than
* a regular file. But again, will do for now.
*/
+ /* Everyone gets all permissions. */
+ vi->i_mode |= S_IRWXUGO;
+ /* If read-only, no one gets write permissions. */
+ if (IS_RDONLY(vi))
+ vi->i_mode &= ~S_IWUGO;
if (m->flags & MFT_RECORD_IS_DIRECTORY) {
vi->i_mode |= S_IFDIR;
+ /*
+ * Apply the directory permissions mask set in the mount
+ * options.
+ */
+ vi->i_mode &= ~vol->dmask;
/* Things break without this kludge! */
if (vi->i_nlink > 1)
vi->i_nlink = 1;
- } else
+ } else {
vi->i_mode |= S_IFREG;
-
+ /* Apply the file permissions mask set in the mount options. */
+ vi->i_mode &= ~vol->fmask;
+ }
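
The hoisted mask handling is easiest to follow with concrete numbers; a sketch with purely hypothetical fmask/dmask values:

	/* Sketch: how the mount-option masks shape i_mode.  The example
	 * values (fmask=0177, dmask=077) are hypothetical. */
	static umode_t ntfs_mode_sketch(int rdonly, int is_dir, umode_t fmask,
			umode_t dmask)
	{
		umode_t mode = S_IRWXUGO;	/* 0777: everyone gets all    */

		if (rdonly)
			mode &= ~S_IWUGO;	/* read-only: drop all w bits */
		if (is_dir)
			return (mode & ~dmask) | S_IFDIR; /* 0777 & ~077  = 0700 */
		return (mode & ~fmask) | S_IFREG;	  /* 0777 & ~0177 = 0600 */
	}
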
/*
* Find the standard information attribute in the mft record. At this
* stage we haven't setup the attribute list stuff yet, so this could
@@ -701,7 +706,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* Setup the runlist. No need for locking as we have
* exclusive access to the inode at this time.
*/
- ni->attr_list_rl.rl = decompress_mapping_pairs(vol,
+ ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
ctx->attr, NULL);
if (IS_ERR(ni->attr_list_rl.rl)) {
err = PTR_ERR(ni->attr_list_rl.rl);
@@ -951,20 +956,9 @@ skip_attr_list_load:
goto unm_err_out;
}
skip_large_dir_stuff:
- /* Everyone gets read and scan permissions. */
- vi->i_mode |= S_IRUGO | S_IXUGO;
- /* If not read-only, set write permissions. */
- if (!IS_RDONLY(vi))
- vi->i_mode |= S_IWUGO;
- /*
- * Apply the directory permissions mask set in the mount
- * options.
- */
- vi->i_mode &= ~vol->dmask;
/* Setup the operations for this inode. */
vi->i_op = &ntfs_dir_inode_ops;
vi->i_fop = &ntfs_dir_ops;
- vi->i_mapping->a_ops = &ntfs_mst_aops;
} else {
/* It is a file. */
ntfs_attr_reinit_search_ctx(ctx);
@@ -1098,18 +1092,14 @@ no_data_attr_special_case:
unmap_mft_record(ni);
m = NULL;
ctx = NULL;
- /* Everyone gets all permissions. */
- vi->i_mode |= S_IRWXUGO;
- /* If read-only, noone gets write permissions. */
- if (IS_RDONLY(vi))
- vi->i_mode &= ~S_IWUGO;
- /* Apply the file permissions mask set in the mount options. */
- vi->i_mode &= ~vol->fmask;
/* Setup the operations for this inode. */
vi->i_op = &ntfs_file_inode_ops;
vi->i_fop = &ntfs_file_ops;
- vi->i_mapping->a_ops = &ntfs_aops;
}
+ if (NInoMstProtected(ni))
+ vi->i_mapping->a_ops = &ntfs_mst_aops;
+ else
+ vi->i_mapping->a_ops = &ntfs_aops;
/*
* The number of 512-byte blocks used on disk (for stat). This is in so
* far inaccurate as it doesn't account for any named streams or other
@@ -1672,8 +1662,8 @@ err_out:
*
* We solve these problems by starting with the $DATA attribute before anything
* else and iterating using ntfs_attr_lookup($DATA) over all extents. As each
- * extent is found, we decompress_mapping_pairs() including the implied
- * ntfs_merge_runlists(). Each step of the iteration necessarily provides
+ * extent is found, we ntfs_mapping_pairs_decompress() including the implied
+ * ntfs_runlists_merge(). Each step of the iteration necessarily provides
* sufficient information for the next step to complete.
*
 * This should work but there are two possible pitfalls (see inline comments
@@ -1762,7 +1752,7 @@ int ntfs_read_inode_mount(struct inode *vi)
vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
/* Provides readpage() and sync_page() for map_mft_record(). */
- vi->i_mapping->a_ops = &ntfs_mft_aops;
+ vi->i_mapping->a_ops = &ntfs_mst_aops;
ctx = ntfs_attr_get_search_ctx(ni, m);
if (!ctx) {
@@ -1810,7 +1800,7 @@ int ntfs_read_inode_mount(struct inode *vi)
goto put_err_out;
}
/* Setup the runlist. */
- ni->attr_list_rl.rl = decompress_mapping_pairs(vol,
+ ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
ctx->attr, NULL);
if (IS_ERR(ni->attr_list_rl.rl)) {
err = PTR_ERR(ni->attr_list_rl.rl);
@@ -1942,11 +1932,11 @@ int ntfs_read_inode_mount(struct inode *vi)
* as we have exclusive access to the inode at this time and we
* are a mount in progress task, too.
*/
- nrl = decompress_mapping_pairs(vol, attr, ni->runlist.rl);
+ nrl = ntfs_mapping_pairs_decompress(vol, attr, ni->runlist.rl);
if (IS_ERR(nrl)) {
- ntfs_error(sb, "decompress_mapping_pairs() failed with "
- "error code %ld. $MFT is corrupt.",
- PTR_ERR(nrl));
+ ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
+ "failed with error code %ld. $MFT is "
+ "corrupt.", PTR_ERR(nrl));
goto put_err_out;
}
ni->runlist.rl = nrl;
@@ -2024,8 +2014,6 @@ int ntfs_read_inode_mount(struct inode *vi)
/* No VFS initiated operations allowed for $MFT. */
vi->i_op = &ntfs_empty_inode_ops;
vi->i_fop = &ntfs_empty_file_ops;
- /* Put back our special address space operations. */
- vi->i_mapping->a_ops = &ntfs_mft_aops;
}
/* Get the lowest vcn for the next extent. */
@@ -2091,37 +2079,24 @@ err_out:
* dropped, we need to put the attribute inode for the directory index bitmap,
* if it is present, otherwise the directory inode would remain pinned for
* ever.
- *
- * If the inode @vi is an index inode with only one reference which is being
- * dropped, we need to put the attribute inode for the index bitmap, if it is
- * present, otherwise the index inode would disappear and the attribute inode
- * for the index bitmap would no longer be referenced from anywhere and thus it
- * would remain pinned for ever.
*/
void ntfs_put_inode(struct inode *vi)
{
- ntfs_inode *ni;
-
- if (S_ISDIR(vi->i_mode)) {
- if (atomic_read(&vi->i_count) == 2) {
- ni = NTFS_I(vi);
- if (NInoIndexAllocPresent(ni) &&
- ni->itype.index.bmp_ino) {
- iput(ni->itype.index.bmp_ino);
- ni->itype.index.bmp_ino = NULL;
+ if (S_ISDIR(vi->i_mode) && atomic_read(&vi->i_count) == 2) {
+ ntfs_inode *ni = NTFS_I(vi);
+ if (NInoIndexAllocPresent(ni)) {
+ struct inode *bvi = NULL;
+ down(&vi->i_sem);
+ if (atomic_read(&vi->i_count) == 2) {
+ bvi = ni->itype.index.bmp_ino;
+ if (bvi)
+ ni->itype.index.bmp_ino = NULL;
}
+ up(&vi->i_sem);
+ if (bvi)
+ iput(bvi);
}
- return;
}
- if (atomic_read(&vi->i_count) != 1)
- return;
- ni = NTFS_I(vi);
- if (NInoAttr(ni) && (ni->type == AT_INDEX_ALLOCATION) &&
- NInoIndexAllocPresent(ni) && ni->itype.index.bmp_ino) {
- iput(ni->itype.index.bmp_ino);
- ni->itype.index.bmp_ino = NULL;
- }
- return;
}
void __ntfs_clear_inode(ntfs_inode *ni)
@@ -2189,6 +2164,18 @@ void ntfs_clear_big_inode(struct inode *vi)
{
ntfs_inode *ni = NTFS_I(vi);
+ /*
+ * If the inode @vi is an index inode we need to put the attribute
+ * inode for the index bitmap, if it is present, otherwise the index
+ * inode would disappear and the attribute inode for the index bitmap
+ * would no longer be referenced from anywhere and thus it would remain
+ * pinned for ever.
+ */
+ if (NInoAttr(ni) && (ni->type == AT_INDEX_ALLOCATION) &&
+ NInoIndexAllocPresent(ni) && ni->itype.index.bmp_ino) {
+ iput(ni->itype.index.bmp_ino);
+ ni->itype.index.bmp_ino = NULL;
+ }
#ifdef NTFS_RW
if (NInoDirty(ni)) {
BOOL was_bad = (is_bad_inode(vi));
@@ -2268,7 +2255,7 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt)
* ntfs_truncate - called when the i_size of an ntfs inode is changed
* @vi: inode for which the i_size was changed
*
- * We don't support i_size changes yet.
+ * We do not support i_size changes yet.
*
* The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
* that the change is allowed.
@@ -2289,6 +2276,8 @@ void ntfs_truncate(struct inode *vi)
MFT_RECORD *m;
int err;
+ BUG_ON(NInoAttr(ni));
+ BUG_ON(ni->nr_extents < 0);
m = map_mft_record(ni);
if (IS_ERR(m)) {
ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
@@ -2500,9 +2489,16 @@ int ntfs_write_inode(struct inode *vi, int sync)
* dirty, since we are going to write this mft record below in any case
* and the base mft record may actually not have been modified so it
* might not need to be written out.
+ * NOTE: It is not a problem when the inode for $MFT itself is being
+ * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES
+ * on the $MFT inode and hence ntfs_write_inode() will not be
+ * re-invoked because of it which in turn is ok since the dirtied mft
+ * record will be cleaned and written out to disk below, i.e. before
+ * this function returns.
*/
if (modified && !NInoTestSetDirty(ctx->ntfs_ino))
- __set_page_dirty_nobuffers(ctx->ntfs_ino->page);
+ mark_ntfs_record_dirty(ctx->ntfs_ino->page,
+ ctx->ntfs_ino->page_ofs);
ntfs_attr_put_search_ctx(ctx);
/* Now the access times are updated, write the base mft record. */
if (NInoDirty(ni))
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index 3aa7b873fe0d..eb54db217e87 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -24,10 +24,18 @@
#ifndef _LINUX_NTFS_INODE_H
#define _LINUX_NTFS_INODE_H
+#include <linux/mm.h>
+#include <linux/fs.h>
#include <linux/seq_file.h>
+#include <linux/list.h>
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
#include "layout.h"
#include "volume.h"
+#include "types.h"
+#include "runlist.h"
+#include "debug.h"
typedef struct _ntfs_inode ntfs_inode;
@@ -269,6 +277,17 @@ extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
extern void ntfs_destroy_big_inode(struct inode *inode);
extern void ntfs_clear_big_inode(struct inode *vi);
+extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
+
+static inline void ntfs_init_big_inode(struct inode *vi)
+{
+ ntfs_inode *ni = NTFS_I(vi);
+
+ ntfs_debug("Entering.");
+ __ntfs_init_inode(vi->i_sb, ni);
+ ni->mft_no = vi->i_ino;
+}
+
extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
unsigned long mft_no);
extern void ntfs_clear_extent_inode(ntfs_inode *ni);
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 57f3a8893f17..a34eed5b922d 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -260,7 +260,7 @@ typedef enum {
enum {
MFT_RECORD_IN_USE = const_cpu_to_le16(0x0001),
MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
-};
+} __attribute__ ((__packed__));
typedef le16 MFT_RECORD_FLAGS;
@@ -316,6 +316,10 @@ typedef enum {
typedef u64 MFT_REF;
typedef le64 leMFT_REF;
+#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \
+ ((MFT_REF)(m) & MFT_REF_MASK_CPU)))
+#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s))
+
#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU))
#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff))
#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
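
The new MK_MREF()/MK_LE_MREF() macros are the inverse of the extraction macros already present; a quick round-trip shows the 48-bit record number / 16-bit sequence number split (the concrete numbers are made up):

	MFT_REF mref = MK_MREF(0x1234, 5);  /* 0x0005000000001234          */
	unsigned long mft_no = MREF(mref);  /* low 48 bits  -> 0x1234      */
	u16 seq_no = MSEQNO(mref);          /* high 16 bits -> 5           */
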
@@ -385,22 +389,87 @@ typedef struct {
NOTE: Every time the mft record is reused
this number is set to zero. NOTE: The first
instance number is always 0. */
-/* sizeof() = 42 bytes */
-/* NTFS 3.1+ (Windows XP and above) introduce the following additions. */
-/* 42*/ //le16 reserved; /* Reserved/alignment. */
-/* 44*/ //le32 mft_record_number;/* Number of this mft record. */
+/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */
+/* 42*/ le16 reserved; /* Reserved/alignment. */
+/* 44*/ le32 mft_record_number; /* Number of this mft record. */
/* sizeof() = 48 bytes */
/*
* When (re)using the mft record, we place the update sequence array at this
- * offset, i.e. before we start with the attributes. This also makes sense,
+ * offset, i.e. before we start with the attributes. This also makes sense,
* otherwise we could run into problems with the update sequence array
* containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work. As you can't protect data
+ * multi sector transfer protection wouldn't work. As you can't protect data
* by overwriting it since you then can't get it back...
* When reading we obviously use the data from the ntfs record header.
*/
} __attribute__ ((__packed__)) MFT_RECORD;
+/* This is the version without the NTFS 3.1+ specific fields. */
+typedef struct {
+/*Ofs*/
+/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+ NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
+ le16 usa_ofs; /* See NTFS_RECORD definition above. */
+ le16 usa_count; /* See NTFS_RECORD definition above. */
+
+/* 8*/ le64 lsn; /* $LogFile sequence number for this record.
+ Changed every time the record is modified. */
+/* 16*/ le16 sequence_number; /* Number of times this mft record has been
+ reused. (See description for MFT_REF
+ above.) NOTE: The increment (skipping zero)
+ is done when the file is deleted. NOTE: If
+ this is zero it is left zero. */
+/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of
+ directory entries referencing this record.
+ NOTE: Only used in mft base records.
+ NOTE: When deleting a directory entry we
+ check the link_count and if it is 1 we
+ delete the file. Otherwise we delete the
+ FILE_NAME_ATTR being referenced by the
+ directory entry from the mft record and
+ decrement the link_count.
+ FIXME: Careful with Win32 + DOS names! */
+/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this
+ mft record from the start of the mft record.
+ NOTE: Must be aligned to 8-byte boundary. */
+/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
+ is deleted, the MFT_RECORD_IN_USE flag is
+ set to zero. */
+/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record.
+ NOTE: Must be aligned to 8-byte boundary. */
+/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft
+ record. This should be equal to the mft
+ record size. */
+/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
+ When it is not zero it is a mft reference
+ pointing to the base mft record to which
+ this record belongs (this is then used to
+ locate the attribute list attribute present
+ in the base record which describes this
+ extension record and hence might need
+ modification when the extension record
+ itself is modified; locating the
+ attribute list also means finding the other
+ potential extents, belonging to the non-base
+ mft record). */
+/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
+ the next attribute added to this mft record.
+ NOTE: Incremented each time after it is used.
+ NOTE: Every time the mft record is reused
+ this number is set to zero. NOTE: The first
+ instance number is always 0. */
+/* sizeof() = 42 bytes */
+/*
+ * When (re)using the mft record, we place the update sequence array at this
+ * offset, i.e. before we start with the attributes. This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work. As you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading we obviously use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) MFT_RECORD_OLD;
+
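
Nothing in this hunk shows how a reader tells the two layouts apart. One plausible check, based only on the offsets visible above (the update sequence array starts right after the 42-byte header on pre-3.1 volumes and after the 48-byte header on NTFS 3.1+), might be:

	/* Illustration only, not code from this patch. */
	static inline BOOL ntfs_mft_record_is_31_plus(const MFT_RECORD *m)
	{
		/* usa_ofs >= 48 implies the 3.1+ layout with the extra
		   reserved and mft_record_number fields. */
		return le16_to_cpu(m->usa_ofs) >= sizeof(MFT_RECORD);
	}
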
/*
* System defined attributes (32-bit). Each attribute type has a corresponding
* attribute name (Unicode string of maximum 64 character length) as described
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 748ed0d78d3b..17888c9f02f1 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -30,6 +30,7 @@
#include "volume.h"
#include "attrib.h"
#include "malloc.h"
+#include "aops.h"
#include "ntfs.h"
/**
@@ -46,7 +47,7 @@
* Locking: - The volume lcn bitmap must be locked for writing on entry and is
* left locked on return.
*/
-static int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
+int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
const runlist_element *rl)
{
struct inode *lcnbmp_vi = vol->lcnbmp_ino;
@@ -855,7 +856,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
err = PTR_ERR(rl);
goto err_out;
}
- if (unlikely(rl->lcn < (LCN)LCN_HOLE)) {
+ if (unlikely(rl->lcn < LCN_HOLE)) {
if (!is_rollback)
ntfs_error(vol->sb, "First runlist element has "
"invalid lcn, aborting.");
@@ -895,7 +896,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
* free them.
*/
for (; rl->length && count != 0; ++rl) {
- if (unlikely(rl->lcn < (LCN)LCN_HOLE)) {
+ if (unlikely(rl->lcn < LCN_HOLE)) {
VCN vcn;
/*
@@ -926,7 +927,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
"element.");
goto err_out;
}
- if (unlikely(rl->lcn < (LCN)LCN_HOLE)) {
+ if (unlikely(rl->lcn < LCN_HOLE)) {
if (!is_rollback)
ntfs_error(vol->sb, "Runlist element "
"has invalid lcn "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index f9292e882adc..4cac1c024af6 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -28,6 +28,7 @@
#include <linux/fs.h>
#include "types.h"
+#include "runlist.h"
#include "volume.h"
typedef enum {
@@ -78,6 +79,34 @@ static inline s64 ntfs_cluster_free(struct inode *vi, const VCN start_vcn,
return __ntfs_cluster_free(vi, start_vcn, count, FALSE);
}
+extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
+ const runlist_element *rl);
+
+/**
+ * ntfs_cluster_free_from_rl - free clusters from runlist
+ * @vol: mounted ntfs volume on which to free the clusters
+ * @rl: runlist describing the clusters to free
+ *
+ * Free all the clusters described by the runlist @rl on the volume @vol. In
+ * the case of an error being returned, at least some of the clusters were not
+ * freed.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: This function takes the volume lcn bitmap lock for writing and
+ * modifies the bitmap contents.
+ */
+static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
+ const runlist_element *rl)
+{
+ int ret;
+
+ down_write(&vol->lcnbmp_lock);
+ ret = ntfs_cluster_free_from_rl_nolock(vol, rl);
+ up_write(&vol->lcnbmp_lock);
+ return ret;
+}
+
#endif /* NTFS_RW */
#endif /* defined _LINUX_NTFS_LCNALLOC_H */
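
Caller-side the new wrapper keeps the bitmap locking invisible; a hypothetical use, e.g. when releasing all clusters described by a runlist @rl, might read:

	/* Hypothetical caller; @vol and @rl assumed set up elsewhere. */
	int err = ntfs_cluster_free_from_rl(vol, rl);
	if (unlikely(err))
		ntfs_error(vol->sb, "Failed to free some clusters (error "
				"code %i).", err);
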
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 1869a4375898..5e280abafab3 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -27,10 +27,13 @@
#include <linux/buffer_head.h>
#include <linux/bitops.h>
+#include "attrib.h"
+#include "aops.h"
+#include "debug.h"
#include "logfile.h"
+#include "malloc.h"
#include "volume.h"
#include "ntfs.h"
-#include "debug.h"
/**
* ntfs_check_restart_page_header - check the page header for consistency
@@ -681,60 +684,20 @@ err_out:
BOOL ntfs_empty_logfile(struct inode *log_vi)
{
ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
- struct address_space *mapping;
- pgoff_t idx, end;
ntfs_debug("Entering.");
- if (NVolLogFileEmpty(vol))
- goto done;
- mapping = log_vi->i_mapping;
- end = (log_vi->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- for (idx = 0; idx < end; ++idx) {
- struct page *page;
- u8 *kaddr;
-
- /* Find or create the current page. (The page is locked.) */
- page = grab_cache_page(mapping, idx);
- if (unlikely(!page)) {
- ntfs_error(vol->sb, "Insufficient memory to grab "
- "$LogFile page (index %lu).", idx);
+ if (!NVolLogFileEmpty(vol)) {
+ int err;
+
+ err = ntfs_attr_set(NTFS_I(log_vi), 0, log_vi->i_size, 0xff);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to fill $LogFile with "
+ "0xff bytes (error code %i).", err);
return FALSE;
}
- /*
- * Set all bytes in the page to 0xff. It doesn't matter if we
- * go beyond i_size, because ntfs_writepage() will take care of
- * that for us.
- */
- kaddr = (u8*)kmap_atomic(page, KM_USER0);
- memset(kaddr, 0xff, PAGE_CACHE_SIZE);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- /*
- * If the page has buffers, mark them uptodate since buffer
- * state and not page state is definitive in 2.6 kernels.
- */
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
-
- bh = head = page_buffers(page);
- do {
- set_buffer_uptodate(bh);
- } while ((bh = bh->b_this_page) != head);
- }
- /* Now that buffers are uptodate, set the page uptodate, too. */
- SetPageUptodate(page);
- /*
- * Set the page and all its buffers dirty and mark the inode
- * dirty, too. The VM will write the page later on.
- */
- set_page_dirty(page);
- /* Finally unlock and release the page. */
- unlock_page(page);
- page_cache_release(page);
- }
- /* We set the flag so we do not clear the log file again on remount. */
- NVolSetLogFileEmpty(vol);
-done:
+ /* Set the flag so we do not have to do it again on remount. */
+ NVolSetLogFileEmpty(vol);
+ }
ntfs_debug("Done.");
return TRUE;
}
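
ntfs_attr_set() is added elsewhere in this patch (fs/ntfs/attrib.c) and its body is not shown here; conceptually it performs the fill the deleted loop open-coded. A simplified sketch of that idea, reconstructed from the removed lines (attribute offsets and buffer state handling elided):

	/* Sketch of the page-cache fill that ntfs_attr_set() replaces. */
	static int ntfs_attr_set_sketch(struct inode *vi, const s64 size,
			const u8 val)
	{
		struct address_space *mapping = vi->i_mapping;
		pgoff_t idx, end;

		end = (size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
		for (idx = 0; idx < end; ++idx) {
			struct page *page = grab_cache_page(mapping, idx);
			u8 *kaddr;

			if (unlikely(!page))
				return -ENOMEM;
			/* Fill the whole page; writepage trims past i_size. */
			kaddr = (u8*)kmap_atomic(page, KM_USER0);
			memset(kaddr, val, PAGE_CACHE_SIZE);
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
			SetPageUptodate(page);
			set_page_dirty(page);	/* VM writes it out later. */
			unlock_page(page);
			page_cache_release(page);
		}
		return 0;
	}
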
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index c8548a5336e0..fac5944df6d8 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -24,6 +24,7 @@
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/highmem.h>
/**
* ntfs_malloc_nofs - allocate memory in multiples of pages
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 9192bbdf2c2e..982cf8e37ba6 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -20,115 +20,20 @@
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/buffer_head.h>
#include <linux/swap.h>
+#include "attrib.h"
+#include "aops.h"
+#include "bitmap.h"
+#include "debug.h"
+#include "dir.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
#include "ntfs.h"
/**
- * __format_mft_record - initialize an empty mft record
- * @m: mapped, pinned and locked for writing mft record
- * @size: size of the mft record
- * @rec_no: mft record number / inode number
- *
- * Private function to initialize an empty mft record. Use one of the two
- * provided format_mft_record() functions instead.
- */
-static void __format_mft_record(MFT_RECORD *m, const int size,
- const unsigned long rec_no)
-{
- ATTR_RECORD *a;
-
- memset(m, 0, size);
- m->magic = magic_FILE;
- /* Aligned to 2-byte boundary. */
- m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
- m->usa_count = cpu_to_le16(size / NTFS_BLOCK_SIZE + 1);
- /* Set the update sequence number to 1. */
- *(le16*)((char*)m + ((sizeof(MFT_RECORD) + 1) & ~1)) = cpu_to_le16(1);
- m->lsn = cpu_to_le64(0LL);
- m->sequence_number = cpu_to_le16(1);
- m->link_count = 0;
- /* Aligned to 8-byte boundary. */
- m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
- (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
- m->flags = 0;
- /*
- * Using attrs_offset plus eight bytes (for the termination attribute),
- * aligned to 8-byte boundary.
- */
- m->bytes_in_use = cpu_to_le32((le16_to_cpu(m->attrs_offset) + 8 + 7) &
- ~7);
- m->bytes_allocated = cpu_to_le32(size);
- m->base_mft_record = cpu_to_le64((MFT_REF)0);
- m->next_attr_instance = 0;
- a = (ATTR_RECORD*)((char*)m + le16_to_cpu(m->attrs_offset));
- a->type = AT_END;
- a->length = 0;
-}
-
-/**
- * format_mft_record - initialize an empty mft record
- * @ni: ntfs inode of mft record
- * @mft_rec: mapped, pinned and locked mft record (optional)
- *
- * Initialize an empty mft record. This is used when extending the MFT.
- *
- * If @mft_rec is NULL, we call map_mft_record() to obtain the
- * record and we unmap it again when finished.
- *
- * We return 0 on success or -errno on error.
- */
-int format_mft_record(ntfs_inode *ni, MFT_RECORD *mft_rec)
-{
- MFT_RECORD *m;
-
- if (mft_rec)
- m = mft_rec;
- else {
- m = map_mft_record(ni);
- if (IS_ERR(m))
- return PTR_ERR(m);
- }
- __format_mft_record(m, ni->vol->mft_record_size, ni->mft_no);
- if (!mft_rec) {
- // FIXME: Need to set the mft record dirty!
- unmap_mft_record(ni);
- }
- return 0;
-}
-
-/**
- * ntfs_readpage - external declaration, function is in fs/ntfs/aops.c
- */
-extern int ntfs_readpage(struct file *, struct page *);
-
-#ifdef NTFS_RW
-/**
- * ntfs_mft_writepage - forward declaration, function is further below
- */
-static int ntfs_mft_writepage(struct page *page, struct writeback_control *wbc);
-#endif /* NTFS_RW */
-
-/**
- * ntfs_mft_aops - address space operations for access to $MFT
- *
- * Address space operations for access to $MFT. This allows us to simply use
- * ntfs_map_page() in map_mft_record_page().
- */
-struct address_space_operations ntfs_mft_aops = {
- .readpage = ntfs_readpage, /* Fill page with data. */
- .sync_page = block_sync_page, /* Currently, just unplugs the
- disk request queue. */
-#ifdef NTFS_RW
- .writepage = ntfs_mft_writepage, /* Write out the dirty mft
- records in a page. */
- .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
- without touching the buffers
- belonging to the page. */
-#endif /* NTFS_RW */
-};
-
-/**
* map_mft_record_page - map the page in which a specific mft record resides
* @ni: ntfs inode whose mft record page to map
*
@@ -397,8 +302,8 @@ map_err_out:
ntfs_clear_extent_inode(ni);
goto map_err_out;
}
- /* Verify the sequence number. */
- if (unlikely(le16_to_cpu(m->sequence_number) != seq_no)) {
+ /* Verify the sequence number if it is present. */
+ if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
ntfs_error(base_ni->vol->sb, "Found stale extent mft "
"reference! Corrupt file system. Run chkdsk.");
destroy_ni = TRUE;
@@ -473,19 +378,11 @@ unm_err_out:
*/
void __mark_mft_record_dirty(ntfs_inode *ni)
{
- struct page *page = ni->page;
ntfs_inode *base_ni;
ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
- BUG_ON(!page);
BUG_ON(NInoAttr(ni));
-
- /*
- * Set the page containing the mft record dirty. This also marks the
- * $MFT inode dirty (I_DIRTY_PAGES).
- */
- __set_page_dirty_nobuffers(page);
-
+ mark_ntfs_record_dirty(ni->page, ni->page_ofs);
/* Determine the base vfs inode and mark it dirty, too. */
down(&ni->extent_lock);
if (likely(ni->nr_extents >= 0))
@@ -501,13 +398,14 @@ static const char *ntfs_please_email = "Please email "
"this message. Thank you.";
/**
- * sync_mft_mirror_umount - synchronise an mft record to the mft mirror
- * @ni: ntfs inode whose mft record to synchronize
+ * ntfs_sync_mft_mirror_umount - synchronize an mft record to the mft mirror
+ * @vol: ntfs volume on which the mft record to synchronize resides
+ * @mft_no: mft record number of mft record to synchronize
* @m: mapped, mst protected (extent) mft record to synchronize
*
- * Write the mapped, mst protected (extent) mft record @m described by the
- * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr) bypassing
- * the page cache and the $MFTMirr inode itself.
+ * Write the mapped, mst protected (extent) mft record @m with mft record
+ * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
+ * bypassing the page cache and the $MFTMirr inode itself.
*
* This function is only for use at umount time when the mft mirror inode has
 * already been disposed of. We BUG() if we are called while the mft mirror
@@ -521,10 +419,9 @@ static const char *ntfs_please_email = "Please email "
* alternative would be either to BUG() or to get a NULL pointer dereference
* and Oops.
*/
-static int sync_mft_mirror_umount(ntfs_inode *ni, MFT_RECORD *m)
+static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
+ const unsigned long mft_no, MFT_RECORD *m)
{
- ntfs_volume *vol = ni->vol;
-
BUG_ON(vol->mftmirr_ino);
ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
"implemented yet. %s", ntfs_please_email);
@@ -532,25 +429,26 @@ static int sync_mft_mirror_umount(ntfs_inode *ni, MFT_RECORD *m)
}
/**
- * sync_mft_mirror - synchronize an mft record to the mft mirror
- * @ni: ntfs inode whose mft record to synchronize
+ * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
+ * @vol: ntfs volume on which the mft record to synchronize resides
+ * @mft_no: mft record number of mft record to synchronize
* @m: mapped, mst protected (extent) mft record to synchronize
* @sync: if true, wait for i/o completion
*
- * Write the mapped, mst protected (extent) mft record @m described by the
- * (regular or extent) ntfs inode @ni to the mft mirror ($MFTMirr).
+ * Write the mapped, mst protected (extent) mft record @m with mft record
+ * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
*
* On success return 0. On error return -errno and set the volume errors flag
- * in the ntfs_volume to which @ni belongs.
+ * in the ntfs volume @vol.
*
* NOTE: We always perform synchronous i/o and ignore the @sync parameter.
*
* TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
* schedule i/o via ->writepage or do it via kntfsd or whatever.
*/
-static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync)
+int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
+ MFT_RECORD *m, int sync)
{
- ntfs_volume *vol = ni->vol;
struct page *page;
unsigned int blocksize = vol->sb->s_blocksize;
int max_bhs = vol->mft_record_size / blocksize;
@@ -560,17 +458,17 @@ static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync)
unsigned int block_start, block_end, m_start, m_end;
int i_bhs, nr_bhs, err = 0;
- ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+ ntfs_debug("Entering for inode 0x%lx.", mft_no);
BUG_ON(!max_bhs);
if (unlikely(!vol->mftmirr_ino)) {
/* This could happen during umount... */
- err = sync_mft_mirror_umount(ni, m);
+ err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
if (likely(!err))
return err;
goto err_out;
}
/* Get the page containing the mirror copy of the mft record @m. */
- page = ntfs_map_page(vol->mftmirr_ino->i_mapping, ni->mft_no >>
+ page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
(PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
if (IS_ERR(page)) {
ntfs_error(vol->sb, "Failed to map mft mirror page.");
@@ -584,59 +482,28 @@ static int sync_mft_mirror(ntfs_inode *ni, MFT_RECORD *m, int sync)
* make sure no one is writing from elsewhere.
*/
lock_page(page);
+ BUG_ON(!PageUptodate(page));
+ ClearPageUptodate(page);
/* The address in the page of the mirror copy of the mft record @m. */
- kmirr = page_address(page) + ((ni->mft_no << vol->mft_record_size_bits)
- & ~PAGE_CACHE_MASK);
+ kmirr = page_address(page) + ((mft_no << vol->mft_record_size_bits) &
+ ~PAGE_CACHE_MASK);
/* Copy the mst protected mft record to the mirror. */
memcpy(kmirr, m, vol->mft_record_size);
/* Make sure we have mapped buffers. */
- if (!page_has_buffers(page)) {
-no_buffers_err_out:
- ntfs_error(vol->sb, "Writing mft mirror records without "
- "existing buffers is not implemented yet. %s",
- ntfs_please_email);
- err = -EOPNOTSUPP;
- goto unlock_err_out;
- }
+ BUG_ON(!page_has_buffers(page));
bh = head = page_buffers(page);
- if (!bh)
- goto no_buffers_err_out;
+ BUG_ON(!bh);
nr_bhs = 0;
block_start = 0;
m_start = kmirr - (u8*)page_address(page);
m_end = m_start + vol->mft_record_size;
do {
block_end = block_start + blocksize;
- /*
- * If the buffer is outside the mft record, just skip it,
- * clearing it if it is dirty to make sure it is not written
- * out. It should never be marked dirty but better be safe.
- */
- if ((block_end <= m_start) || (block_start >= m_end)) {
- if (buffer_dirty(bh)) {
- ntfs_warning(vol->sb, "Clearing dirty mft "
- "record page buffer. %s",
- ntfs_please_email);
- clear_buffer_dirty(bh);
- }
- continue;
- }
- if (!buffer_mapped(bh)) {
- ntfs_error(vol->sb, "Writing mft mirror records "
- "without existing mapped buffers is "
- "not implemented yet. %s",
- ntfs_please_email);
- err = -EOPNOTSUPP;
- continue;
- }
- if (!buffer_uptodate(bh)) {
- ntfs_error(vol->sb, "Writing mft mirror records "
- "without existing uptodate buffers is "
- "not implemented yet. %s",
- ntfs_please_email);
- err = -EOPNOTSUPP;
+ /* If the buffer is outside the mft record, skip it. */
+ if ((block_end <= m_start) || (block_start >= m_end))
continue;
- }
+ BUG_ON(!buffer_mapped(bh));
+ BUG_ON(!buffer_uptodate(bh));
BUG_ON(!nr_bhs && (m_start != block_start));
BUG_ON(nr_bhs >= max_bhs);
bhs[nr_bhs++] = bh;
@@ -664,11 +531,10 @@ no_buffers_err_out:
if (unlikely(!buffer_uptodate(tbh))) {
err = -EIO;
/*
- * Set the buffer uptodate so the page & buffer
- * states don't become out of sync.
+ * Set the buffer uptodate so the page and
+ * buffer states do not become out of sync.
*/
- if (PageUptodate(page))
- set_buffer_uptodate(tbh);
+ set_buffer_uptodate(tbh);
}
}
} else /* if (unlikely(err)) */ {
@@ -676,29 +542,25 @@ no_buffers_err_out:
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
clear_buffer_dirty(bhs[i_bhs]);
}
-unlock_err_out:
/* Current state: all buffers are clean, unlocked, and uptodate. */
/* Remove the mst protection fixups again. */
post_write_mst_fixup((NTFS_RECORD*)kmirr);
flush_dcache_page(page);
+ SetPageUptodate(page);
unlock_page(page);
ntfs_unmap_page(page);
- if (unlikely(err)) {
- /* I/O error during writing. This is really bad! */
+ if (likely(!err)) {
+ ntfs_debug("Done.");
+ } else {
ntfs_error(vol->sb, "I/O error while writing mft mirror "
- "record 0x%lx! You should unmount the volume "
- "and run chkdsk or ntfsfix.", ni->mft_no);
- goto err_out;
- }
- ntfs_debug("Done.");
- return 0;
+ "record 0x%lx!", mft_no);
err_out:
- ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error code %i). "
- "Volume will be left marked dirty on umount. Run "
- "ntfsfix on the partition after umounting to correct "
- "this.", -err);
- /* We don't want to clear the dirty bit on umount. */
- NVolSetErrors(vol);
+ ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
+ "code %i). Volume will be left marked dirty "
+ "on umount. Run ntfsfix on the partition "
+ "after umounting to correct this.", -err);
+ NVolSetErrors(vol);
+ }
return err;
}
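
The i/o pattern used by the rewritten mirror sync (and by write_mft_record_nolock() below) is the usual synchronous buffer write; stripped to its skeleton it is roughly:

	/* Skeleton: write the collected buffers bhs[0..nr_bhs) and wait. */
	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
		struct buffer_head *tbh = bhs[i_bhs];

		if (unlikely(test_set_buffer_locked(tbh)))
			BUG();		/* page lock keeps others away */
		clear_buffer_dirty(tbh);	/* we write it right now */
		get_bh(tbh);
		tbh->b_end_io = end_buffer_write_sync;
		submit_bh(WRITE, tbh);
	}
	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
		struct buffer_head *tbh = bhs[i_bhs];

		wait_on_buffer(tbh);	/* synchronous completion */
		if (unlikely(!buffer_uptodate(tbh)))
			err = -EIO;
	}
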
@@ -712,6 +574,11 @@ err_out:
* ntfs inode @ni to backing store. If the mft record @m has a counterpart in
* the mft mirror, that is also updated.
*
+ * We only write the mft record if the ntfs inode @ni is dirty and the first
+ * buffer belonging to its mft record is dirty, too. We ignore the dirty state
+ * of subsequent buffers because we could have raced with
+ * fs/ntfs/aops.c::mark_ntfs_record_dirty().
+ *
* On success, clean the mft record and return 0. On error, leave the mft
* record dirty and return -errno. The caller should call make_bad_inode() on
* the base inode to ensure no more access happens to this inode. We do not do
@@ -741,6 +608,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
struct buffer_head *bh, *head;
unsigned int block_start, block_end, m_start, m_end;
int i_bhs, nr_bhs, err = 0;
+ BOOL rec_is_dirty = TRUE;
ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
BUG_ON(NInoAttr(ni));
@@ -754,59 +622,46 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
*/
if (!NInoTestClearDirty(ni))
goto done;
- /* Make sure we have mapped buffers. */
- if (!page_has_buffers(page)) {
-no_buffers_err_out:
- ntfs_error(vol->sb, "Writing mft records without existing "
- "buffers is not implemented yet. %s",
- ntfs_please_email);
- err = -EOPNOTSUPP;
- goto err_out;
- }
+ BUG_ON(!page_has_buffers(page));
bh = head = page_buffers(page);
- if (!bh)
- goto no_buffers_err_out;
+ BUG_ON(!bh);
nr_bhs = 0;
block_start = 0;
m_start = ni->page_ofs;
m_end = m_start + vol->mft_record_size;
do {
block_end = block_start + blocksize;
- /*
- * If the buffer is outside the mft record, just skip it,
- * clearing it if it is dirty to make sure it is not written
- * out. It should never be marked dirty but better be safe.
- */
- if ((block_end <= m_start) || (block_start >= m_end)) {
- if (buffer_dirty(bh)) {
- ntfs_warning(vol->sb, "Clearing dirty mft "
- "record page buffer. %s",
- ntfs_please_email);
- clear_buffer_dirty(bh);
- }
- continue;
- }
- if (!buffer_mapped(bh)) {
- ntfs_error(vol->sb, "Writing mft records without "
- "existing mapped buffers is not "
- "implemented yet. %s",
- ntfs_please_email);
- err = -EOPNOTSUPP;
- continue;
- }
- if (!buffer_uptodate(bh)) {
- ntfs_error(vol->sb, "Writing mft records without "
- "existing uptodate buffers is not "
- "implemented yet. %s",
- ntfs_please_email);
- err = -EOPNOTSUPP;
+ /* If the buffer is outside the mft record, skip it. */
+ if (block_end <= m_start)
continue;
+ if (unlikely(block_start >= m_end))
+ break;
+ if (block_start == m_start) {
+ /* This block is the first one in the record. */
+ if (!buffer_dirty(bh)) {
+ /* Clean records are not written out. */
+ rec_is_dirty = FALSE;
+ continue;
+ }
+ rec_is_dirty = TRUE;
+ } else {
+ /*
+ * This block is not the first one in the record. We
+ * ignore the buffer's dirty state because we could
+ * have raced with a parallel mark_ntfs_record_dirty().
+ */
+ if (!rec_is_dirty)
+ continue;
}
+ BUG_ON(!buffer_mapped(bh));
+ BUG_ON(!buffer_uptodate(bh));
BUG_ON(!nr_bhs && (m_start != block_start));
BUG_ON(nr_bhs >= max_bhs);
bhs[nr_bhs++] = bh;
BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
} while (block_start = block_end, (bh = bh->b_this_page) != head);
+ if (!rec_is_dirty)
+ goto done;
if (unlikely(err))
goto cleanup_out;
/* Apply the mst protection fixups. */
@@ -823,15 +678,14 @@ no_buffers_err_out:
if (unlikely(test_set_buffer_locked(tbh)))
BUG();
BUG_ON(!buffer_uptodate(tbh));
- if (buffer_dirty(tbh))
- clear_buffer_dirty(tbh);
+ clear_buffer_dirty(tbh);
get_bh(tbh);
tbh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, tbh);
}
/* Synchronize the mft mirror now if not @sync. */
if (!sync && ni->mft_no < vol->mftmirr_size)
- sync_mft_mirror(ni, m, sync);
+ ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
/* Wait on i/o completion of buffers. */
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
@@ -840,8 +694,8 @@ no_buffers_err_out:
if (unlikely(!buffer_uptodate(tbh))) {
err = -EIO;
/*
- * Set the buffer uptodate so the page & buffer states
- * don't become out of sync.
+ * Set the buffer uptodate so the page and buffer
+ * states do not become out of sync.
*/
if (PageUptodate(page))
set_buffer_uptodate(tbh);
@@ -849,7 +703,7 @@ no_buffers_err_out:
}
/* If @sync, now synchronize the mft mirror. */
if (sync && ni->mft_no < vol->mftmirr_size)
- sync_mft_mirror(ni, m, sync);
+ ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
/* Remove the mst protection fixups again. */
post_write_mst_fixup((NTFS_RECORD*)m);
flush_dcache_mft_record_page(ni);
@@ -885,214 +739,1987 @@ err_out:
}
/**
- * ntfs_mft_writepage - check if a metadata page contains dirty mft records
- * @page: metadata page possibly containing dirty mft records
- * @wbc: writeback control structure
- *
- * This is called from the VM when it wants to have a dirty $MFT/$DATA metadata
- * page cache page cleaned. The VM has already locked the page and marked it
- * clean. Instead of writing the page as a conventional ->writepage function
- * would do, we check if the page still contains any dirty mft records (it must
- * have done at some point in the past since the page was marked dirty) and if
- * none are found, i.e. all mft records are clean, we unlock the page and
- * return. The VM is then free to do with the page as it pleases. If on the
- * other hand we do find any dirty mft records in the page, we redirty the page
- * before unlocking it and returning so the VM knows that the page is still
- * busy and cannot be thrown out.
- *
- * Note, we do not actually write any dirty mft records here because they are
- * dirty inodes and hence will be written by the VFS inode dirty code paths.
- * There is no need to write them from the VM page dirty code paths, too and in
- * fact once we implement journalling it would be a complete nightmare having
- * two code paths leading to mft record writeout.
+ * ntfs_may_write_mft_record - check if an mft record may be written out
+ * @vol: [IN] ntfs volume on which the mft record to check resides
+ * @mft_no: [IN] mft record number of the mft record to check
+ * @m: [IN] mapped mft record to check
+ * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned
+ *
+ * Check if the mapped (base or extent) mft record @m with mft record number
+ * @mft_no belonging to the ntfs volume @vol may be written out. If necessary
+ * and possible the ntfs inode of the mft record is locked and the base vfs
+ * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The
+ * caller is responsible for unlocking the ntfs inode and unpinning the base
+ * vfs inode.
+ *
+ * Return TRUE if the mft record may be written out and FALSE if not.
+ *
+ * The caller has locked the page and cleared the uptodate flag on it which
+ * means that we can safely write out any dirty mft records that do not have
+ * their inodes in icache as determined by ilookup5() as anyone
+ * opening/creating such an inode would block when attempting to map the mft
+ * record in read_cache_page() until we are finished with the write out.
+ *
+ * Here is a description of the tests we perform:
+ *
+ * If the inode is found in icache we know the mft record must be a base mft
+ * record. If it is dirty, we do not write it and return FALSE as the vfs
+ * inode write paths will result in the access times being updated which would
+ * cause the base mft record to be redirtied and written out again. (We know
+ * the access time update will modify the base mft record because Windows
+ * chkdsk complains if the standard information attribute is not in the base
+ * mft record.)
+ *
+ * If the inode is in icache and not dirty, we attempt to lock the mft record
+ * and if we find the lock was already taken, it is not safe to write the mft
+ * record and we return FALSE.
+ *
+ * If we manage to obtain the lock we have exclusive access to the mft record,
+ * which also allows us safe writeout of the mft record. We then set
+ * @locked_ni to the locked ntfs inode and return TRUE.
+ *
+ * Note we cannot just lock the mft record and sleep while waiting for the lock
+ * because this would deadlock due to lock reversal (normally the mft record is
+ * locked before the page is locked but we already have the page locked here
+ * when we try to lock the mft record).
+ *
+ * If the inode is not in icache we need to perform further checks.
+ *
+ * If the mft record is not a FILE record or it is a base mft record, we can
+ * safely write it and return TRUE.
+ *
+ * We now know the mft record is an extent mft record. We check if the inode
+ * corresponding to its base mft record is in icache and obtain a reference to
+ * it if it is. If it is not, we can safely write it and return TRUE.
+ *
+ * We now have the base inode for the extent mft record. We check if it has an
+ * ntfs inode for the extent mft record attached and if not it is safe to write
+ * the extent mft record and we return TRUE.
+ *
+ * The ntfs inode for the extent mft record is attached to the base inode so we
+ * attempt to lock the extent mft record and if we find the lock was already
+ * taken, it is not safe to write the extent mft record and we return FALSE.
+ *
+ * If we manage to obtain the lock we have exclusive access to the extent mft
+ * record, which also allows us safe writeout of the extent mft record. We
+ * set the ntfs inode of the extent mft record clean and then set @locked_ni to
+ * the now locked ntfs inode and return TRUE.
+ *
+ * Note, the reason for actually writing dirty mft records here and not just
+ * relying on the vfs inode dirty code paths is that we can have mft records
+ * modified without them ever having actual inodes in memory. Also we can have
+ * dirty mft records with clean ntfs inodes in memory. None of the described
+ * cases would result in the dirty mft records being written out if we only
+ * relied on the vfs inode dirty code paths. And these cases can really occur
+ * during allocation of new mft records and in particular when the
+ * initialized_size of the $MFT/$DATA attribute is extended and the new space
+ * is initialized using ntfs_mft_record_format(). The clean inode can then
+ * appear if the mft record is reused for a new inode before it got written
+ * out.
*/
-static int ntfs_mft_writepage(struct page *page, struct writeback_control *wbc)
+BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
+ const MFT_RECORD *m, ntfs_inode **locked_ni)
{
- struct inode *mft_vi = page->mapping->host;
- struct super_block *sb = mft_vi->i_sb;
- ntfs_volume *vol = NTFS_SB(sb);
- u8 *maddr;
- MFT_RECORD *m;
- ntfs_inode **extent_nis;
- unsigned long mft_no;
- int nr, i, j;
- BOOL is_dirty = FALSE;
+ struct super_block *sb = vol->sb;
+ struct inode *mft_vi = vol->mft_ino;
+ struct inode *vi;
+ ntfs_inode *ni, *eni, **extent_nis;
+ int i;
+ ntfs_attr na;
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
- BUG_ON(mft_vi != vol->mft_ino);
- /* The first mft record number in the page. */
- mft_no = page->index << (PAGE_CACHE_SHIFT - vol->mft_record_size_bits);
- /* Number of mft records in the page. */
- nr = PAGE_CACHE_SIZE >> vol->mft_record_size_bits;
- BUG_ON(!nr);
- ntfs_debug("Entering for %i inodes starting at 0x%lx.", nr, mft_no);
- /* Iterate over the mft records in the page looking for a dirty one. */
- maddr = (u8*)kmap(page);
- for (i = 0; i < nr; ++i, ++mft_no, maddr += vol->mft_record_size) {
- struct inode *vi;
- ntfs_inode *ni, *eni;
- ntfs_attr na;
-
- na.mft_no = mft_no;
- na.name = NULL;
- na.name_len = 0;
- na.type = AT_UNUSED;
- /*
- * Check if the inode corresponding to this mft record is in
- * the VFS inode cache and obtain a reference to it if it is.
- */
- ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
- /*
- * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from
- * here or we deadlock because the inode is already locked by
- * the kernel (fs/fs-writeback.c::__sync_single_inode()) and
- * ilookup5() waits until the inode is unlocked before
- * returning it and it never gets unlocked because
- * ntfs_mft_writepage() never returns. )-: Fortunately, we
- * have inode 0 pinned in icache for the duration of the mount
- * so we can access it directly.
- */
- if (!mft_no) {
- /* Balance the below iput(). */
- vi = igrab(mft_vi);
- BUG_ON(vi != mft_vi);
- } else
- vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na);
- if (vi) {
- ntfs_debug("Inode 0x%lx is in icache.", mft_no);
- /* The inode is in icache. Check if it is dirty. */
- ni = NTFS_I(vi);
- if (!NInoDirty(ni)) {
- /* The inode is not dirty, skip this record. */
- ntfs_debug("Inode 0x%lx is not dirty, "
- "continuing search.", mft_no);
- iput(vi);
- continue;
- }
- ntfs_debug("Inode 0x%lx is dirty, aborting search.",
+ ntfs_debug("Entering for inode 0x%lx.", mft_no);
+ /*
+ * Normally we do not return a locked inode so set @locked_ni to NULL.
+ */
+ BUG_ON(!locked_ni);
+ *locked_ni = NULL;
+ /*
+ * Check if the inode corresponding to this mft record is in the VFS
+ * inode cache and obtain a reference to it if it is.
+ */
+ ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
+ na.mft_no = mft_no;
+ na.name = NULL;
+ na.name_len = 0;
+ na.type = AT_UNUSED;
+ /*
+ * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or
+ * we deadlock because the inode is already locked by the kernel
+ * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits
+ * until the inode is unlocked before returning it and it never gets
+ * unlocked because ntfs_may_write_mft_record() never returns. )-:
+ * Fortunately, we have inode 0 pinned in icache for the duration of
+ * the mount so we can access it directly.
+ */
+ if (!mft_no) {
+ /* Balance the below iput(). */
+ vi = igrab(mft_vi);
+ BUG_ON(vi != mft_vi);
+ } else
+ vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na);
+ if (vi) {
+ ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
+ /* The inode is in icache. */
+ ni = NTFS_I(vi);
+ /* Take a reference to the ntfs inode. */
+ atomic_inc(&ni->count);
+ /* If the inode is dirty, do not write this record. */
+ if (NInoDirty(ni)) {
+ ntfs_debug("Inode 0x%lx is dirty, do not write it.",
mft_no);
- /* The inode is dirty, no need to search further. */
+ atomic_dec(&ni->count);
iput(vi);
- is_dirty = TRUE;
- break;
+ return FALSE;
}
- ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
- /* The inode is not in icache. */
- /* Skip the record if it is not a mft record (type "FILE"). */
- if (!ntfs_is_mft_recordp((le32*)maddr)) {
- ntfs_debug("Mft record 0x%lx is not a FILE record, "
- "continuing search.", mft_no);
- continue;
+ ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
+ /* The inode is not dirty, try to take the mft record lock. */
+ if (unlikely(down_trylock(&ni->mrec_lock))) {
+ ntfs_debug("Mft record 0x%lx is already locked, do "
+ "not write it.", mft_no);
+ atomic_dec(&ni->count);
+ iput(vi);
+ return FALSE;
}
- m = (MFT_RECORD*)maddr;
+ ntfs_debug("Managed to lock mft record 0x%lx, write it.",
+ mft_no);
/*
- * Skip the mft record if it is not in use. FIXME: What about
- * deleted/deallocated (extent) inodes? (AIA)
+ * The write has to occur while we hold the mft record lock so
+ * return the locked ntfs inode.
*/
- if (!(m->flags & MFT_RECORD_IN_USE)) {
- ntfs_debug("Mft record 0x%lx is not in use, "
- "continuing search.", mft_no);
- continue;
- }
- /* Skip the mft record if it is a base inode. */
- if (!m->base_mft_record) {
- ntfs_debug("Mft record 0x%lx is a base record, "
- "continuing search.", mft_no);
- continue;
- }
+ *locked_ni = ni;
+ return TRUE;
+ }
+ ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
+ /* The inode is not in icache. */
+ /* Write the record if it is not a mft record (type "FILE"). */
+ if (!ntfs_is_mft_record(m->magic)) {
+ ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
+ mft_no);
+ return TRUE;
+ }
+ /* Write the mft record if it is a base inode. */
+ if (!m->base_mft_record) {
+ ntfs_debug("Mft record 0x%lx is a base record, write it.",
+ mft_no);
+ return TRUE;
+ }
+ /*
+ * This is an extent mft record. Check if the inode corresponding to
+ * its base mft record is in icache and obtain a reference to it if it
+ * is.
+ */
+ na.mft_no = MREF_LE(m->base_mft_record);
+ ntfs_debug("Mft record 0x%lx is an extent record. Looking for base "
+ "inode 0x%lx in icache.", mft_no, na.mft_no);
+ vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na);
+ if (!vi) {
+ /*
+ * The base inode is not in icache, write this extent mft
+ * record.
+ */
+ ntfs_debug("Base inode 0x%lx is not in icache, write the "
+ "extent record.", na.mft_no);
+ return TRUE;
+ }
+ ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
+ /*
+ * The base inode is in icache. Check if it has the extent inode
+ * corresponding to this extent mft record attached.
+ */
+ ni = NTFS_I(vi);
+ down(&ni->extent_lock);
+ if (ni->nr_extents <= 0) {
/*
- * This is an extent mft record. Check if the inode
- * corresponding to its base mft record is in icache.
+ * The base inode has no attached extent inodes, write this
+ * extent mft record.
*/
- na.mft_no = MREF_LE(m->base_mft_record);
- ntfs_debug("Mft record 0x%lx is an extent record. Looking "
- "for base inode 0x%lx in icache.", mft_no,
- na.mft_no);
- vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode,
- &na);
- if (!vi) {
+ up(&ni->extent_lock);
+ iput(vi);
+ ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
+ "write the extent record.", na.mft_no);
+ return TRUE;
+ }
+ /* Iterate over the attached extent inodes. */
+ extent_nis = ni->ext.extent_ntfs_inos;
+ for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
+ if (mft_no == extent_nis[i]->mft_no) {
/*
- * The base inode is not in icache. Skip this extent
+ * Found the extent inode corresponding to this extent
* mft record.
*/
- ntfs_debug("Base inode 0x%lx is not in icache, "
- "continuing search.", na.mft_no);
- continue;
+ eni = extent_nis[i];
+ break;
}
- ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
+ }
+ /*
+ * If the extent inode was not attached to the base inode, write this
+ * extent mft record.
+ */
+ if (!eni) {
+ up(&ni->extent_lock);
+ iput(vi);
+ ntfs_debug("Extent inode 0x%lx is not attached to its base "
+ "inode 0x%lx, write the extent record.",
+ mft_no, na.mft_no);
+ return TRUE;
+ }
+ ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
+ mft_no, na.mft_no);
+ /* Take a reference to the extent ntfs inode. */
+ atomic_inc(&eni->count);
+ up(&ni->extent_lock);
+ /*
+ * Found the extent inode corresponding to this extent mft record.
+ * Try to take the mft record lock.
+ */
+ if (unlikely(down_trylock(&eni->mrec_lock))) {
+ atomic_dec(&eni->count);
+ iput(vi);
+ ntfs_debug("Extent mft record 0x%lx is already locked, do "
+ "not write it.", mft_no);
+ return FALSE;
+ }
+ ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
+ mft_no);
+ if (NInoTestClearDirty(eni))
+ ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
+ mft_no);
+ /*
+ * The write has to occur while we hold the mft record lock so return
+ * the locked extent ntfs inode.
+ */
+ *locked_ni = eni;
+ return TRUE;
+}
+
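
Seen from the caller's side, the contract the comment block spells out comes down to the following; the sketch is hypothetical (the real caller is the $MFT writeback path in fs/ntfs/aops.c, not part of this hunk):

	ntfs_inode *locked_ni;

	if (ntfs_may_write_mft_record(vol, mft_no, m, &locked_ni)) {
		write_this_record(vol, mft_no, m);	/* hypothetical helper */
		if (locked_ni) {
			/* Undo what ntfs_may_write_mft_record() took. */
			up(&locked_ni->mrec_lock);
			atomic_dec(&locked_ni->count);
			/* ...and iput() the base vfs inode it pinned. */
		}
	}	/* else: leave the record dirty; writeback retries later. */
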
+static const char *es = " Leaving inconsistent metadata. Unmount and run "
+ "chkdsk.";
+
+/**
+ * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
+ * @vol: volume on which to search for a free mft record
+ * @base_ni: open base inode if allocating an extent mft record or NULL
+ *
+ * Search for a free mft record in the mft bitmap attribute on the ntfs volume
+ * @vol.
+ *
+ * If @base_ni is NULL start the search at the default allocator position.
+ *
+ * If @base_ni is not NULL start the search at the mft record after the base
+ * mft record @base_ni.
+ *
+ * Return the free mft record on success and -errno on error. An error code of
+ * -ENOSPC means that there are no free mft records in the currently
+ * initialized mft bitmap.
+ *
+ * Locking: Caller must hold vol->mftbmp_lock for writing.
+ */
+static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
+ ntfs_inode *base_ni)
+{
+ s64 pass_end, ll, data_pos, pass_start, ofs, bit;
+ struct address_space *mftbmp_mapping;
+ u8 *buf, *byte;
+ struct page *page;
+ unsigned int page_ofs, size;
+ u8 pass, b;
+
+ ntfs_debug("Searching for free mft record in the currently "
+ "initialized mft bitmap.");
+ mftbmp_mapping = vol->mftbmp_ino->i_mapping;
+ /*
+ * Set the end of the pass making sure we do not overflow the mft
+ * bitmap.
+ */
+ pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
+ vol->mft_record_size_bits;
+ ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
+ if (pass_end > ll)
+ pass_end = ll;
+ pass = 1;
+ if (!base_ni)
+ data_pos = vol->mft_data_pos;
+ else
+ data_pos = base_ni->mft_no + 1;
+ if (data_pos < 24)
+ data_pos = 24;
+ if (data_pos >= pass_end) {
+ data_pos = 24;
+ pass = 2;
+ /* This happens on a freshly formatted volume. */
+ if (data_pos >= pass_end)
+ return -ENOSPC;
+ }
+ pass_start = data_pos;
+ ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
+ "pass_end 0x%llx, data_pos 0x%llx.", pass,
+ (long long)pass_start, (long long)pass_end,
+ (long long)data_pos);
+ /* Loop until a free mft record is found. */
+ for (; pass <= 2;) {
+ /* Cap size to pass_end. */
+ ofs = data_pos >> 3;
+ page_ofs = ofs & ~PAGE_CACHE_MASK;
+ size = PAGE_CACHE_SIZE - page_ofs;
+ ll = ((pass_end + 7) >> 3) - ofs;
+ if (size > ll)
+ size = ll;
+ size <<= 3;
/*
- * The base inode is in icache. Check if it has the extent
- * inode corresponding to this extent mft record attached.
+ * If we are still within the active pass, search the next page
+ * for a zero bit.
*/
- ni = NTFS_I(vi);
- down(&ni->extent_lock);
- if (ni->nr_extents <= 0) {
+ if (size) {
+ page = ntfs_map_page(mftbmp_mapping,
+ ofs >> PAGE_CACHE_SHIFT);
+ if (unlikely(IS_ERR(page))) {
+ ntfs_error(vol->sb, "Failed to read mft "
+ "bitmap, aborting.");
+ return PTR_ERR(page);
+ }
+ buf = (u8*)page_address(page) + page_ofs;
+ bit = data_pos & 7;
+ data_pos &= ~7ull;
+ ntfs_debug("Before inner for loop: size 0x%x, "
+ "data_pos 0x%llx, bit 0x%llx", size,
+ (long long)data_pos, (long long)bit);
+ for (; bit < size && data_pos + bit < pass_end;
+ bit &= ~7ull, bit += 8) {
+ byte = buf + (bit >> 3);
+ if (*byte == 0xff)
+ continue;
+ b = ffz((unsigned long)*byte);
+ if (b < 8 && b >= (bit & 7)) {
+ ll = data_pos + (bit & ~7ull) + b;
+ if (unlikely(ll > (1ll << 32))) {
+ ntfs_unmap_page(page);
+ return -ENOSPC;
+ }
+ *byte |= 1 << b;
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ ntfs_unmap_page(page);
+ ntfs_debug("Done. (Found and "
+ "allocated mft record "
+ "0x%llx.)",
+ (long long)ll);
+ return ll;
+ }
+ }
+ ntfs_debug("After inner for loop: size 0x%x, "
+ "data_pos 0x%llx, bit 0x%llx", size,
+ (long long)data_pos, (long long)bit);
+ data_pos += size;
+ ntfs_unmap_page(page);
/*
- * The base inode has no attached extent inodes. Skip
- * this extent mft record.
+ * If the end of the pass has not been reached yet,
+ * continue searching the mft bitmap for a zero bit.
*/
- up(&ni->extent_lock);
- iput(vi);
- continue;
+ if (data_pos < pass_end)
+ continue;
}
- /* Iterate over the attached extent inodes. */
- extent_nis = ni->ext.extent_ntfs_inos;
- for (eni = NULL, j = 0; j < ni->nr_extents; ++j) {
- if (mft_no == extent_nis[j]->mft_no) {
- /*
- * Found the extent inode corresponding to this
- * extent mft record.
- */
- eni = extent_nis[j];
+ /* Do the next pass. */
+ if (++pass == 2) {
+ /*
+ * Starting the second pass, in which we scan the first
+ * part of the zone which we omitted earlier.
+ */
+ pass_end = pass_start;
+ data_pos = pass_start = 24;
+ ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
+ "0x%llx.", pass, (long long)pass_start,
+ (long long)pass_end);
+ if (data_pos >= pass_end)
break;
+ }
+ }
+ /* No free mft records in currently initialized mft bitmap. */
+ ntfs_debug("Done. (No free mft records left in currently initialized "
+ "mft bitmap.)");
+ return -ENOSPC;
+}
+
+/**
+ * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
+ * @vol: volume on which to extend the mft bitmap attribute
+ *
+ * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
+ *
+ * Note: Only changes allocated_size, i.e. does not touch initialized_size or
+ * data_size.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - Caller must hold vol->mftbmp_lock for writing.
+ * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
+ * writing and releases it before returning.
+ * - This function takes vol->lcnbmp_lock for writing and releases it
+ * before returning.
+ */
+static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
+{
+ LCN lcn;
+ s64 ll;
+ struct page *page;
+ ntfs_inode *mft_ni, *mftbmp_ni;
+ runlist_element *rl, *rl2 = NULL;
+ ntfs_attr_search_ctx *ctx = NULL;
+ MFT_RECORD *mrec;
+ ATTR_RECORD *a = NULL;
+ int ret, mp_size;
+ u32 old_alen = 0;
+ u8 *b, tb;
+ struct {
+ u8 added_cluster:1;
+ u8 added_run:1;
+ u8 mp_rebuilt:1;
+ } status = { 0, 0, 0 };
+
+ ntfs_debug("Extending mft bitmap allocation.");
+ mft_ni = NTFS_I(vol->mft_ino);
+ mftbmp_ni = NTFS_I(vol->mftbmp_ino);
+ /*
+ * Determine the last lcn of the mft bitmap. The allocated size of the
+ * mft bitmap cannot be zero so we are ok to do this.
+ * ntfs_find_vcn() returns the runlist locked on success.
+ */
+ rl = ntfs_find_vcn(mftbmp_ni, (mftbmp_ni->allocated_size - 1) >>
+ vol->cluster_size_bits, TRUE);
+ if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+ ntfs_error(vol->sb, "Failed to determine last allocated "
+ "cluster of mft bitmap attribute.");
+ if (!IS_ERR(rl)) {
+ up_write(&mftbmp_ni->runlist.lock);
+ ret = -EIO;
+ } else
+ ret = PTR_ERR(rl);
+ return ret;
+ }
+ lcn = rl->lcn + rl->length;
+ ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
+ (long long)lcn);
+ /*
+ * Attempt to get the cluster following the last allocated cluster by
+ * hand as it may be in the MFT zone so the allocator would not give it
+ * to us.
+ */
+ ll = lcn >> 3;
+ page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
+ ll >> PAGE_CACHE_SHIFT);
+ if (IS_ERR(page)) {
+ up_write(&mftbmp_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
+ return PTR_ERR(page);
+ }
+ b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK);
+ tb = 1 << (lcn & 7ull);
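+	/*
+	 * Example: for lcn 0x123, ll is byte 0x24 of the lcn bitmap and tb
+	 * is 1 << 3, the mask for bit 3 within that byte.
+	 */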
+ down_write(&vol->lcnbmp_lock);
+ if (*b != 0xff && !(*b & tb)) {
+ /* Next cluster is free, allocate it. */
+ *b |= tb;
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ up_write(&vol->lcnbmp_lock);
+ ntfs_unmap_page(page);
+ /* Update the mft bitmap runlist. */
+ rl->length++;
+ rl[1].vcn++;
+ status.added_cluster = 1;
+ ntfs_debug("Appending one cluster to mft bitmap.");
+ } else {
+ up_write(&vol->lcnbmp_lock);
+ ntfs_unmap_page(page);
+ /* Allocate a cluster from the DATA_ZONE. */
+ rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE);
+ if (IS_ERR(rl2)) {
+ up_write(&mftbmp_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to allocate a cluster for "
+ "the mft bitmap.");
+ return PTR_ERR(rl2);
+ }
+ rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
+ if (IS_ERR(rl)) {
+ up_write(&mftbmp_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to merge runlists for mft "
+ "bitmap.");
+ if (ntfs_cluster_free_from_rl(vol, rl2)) {
+			ntfs_error(vol->sb, "Failed to deallocate "
+ "allocated cluster.%s", es);
+ NVolSetErrors(vol);
}
+ ntfs_free(rl2);
+ return PTR_ERR(rl);
}
+ mftbmp_ni->runlist.rl = rl;
+ status.added_run = 1;
+ ntfs_debug("Adding one run to mft bitmap.");
+ /* Find the last run in the new runlist. */
+ for (; rl[1].length; rl++)
+ ;
+ }
+ /*
+ * Update the attribute record as well. Note: @rl is the last
+ * (non-terminator) runlist element of mft bitmap.
+ */
+ mrec = map_mft_record(mft_ni);
+ if (IS_ERR(mrec)) {
+ ntfs_error(vol->sb, "Failed to map mft record.");
+ ret = PTR_ERR(mrec);
+ goto undo_alloc;
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.");
+ ret = -ENOMEM;
+ goto undo_alloc;
+ }
+ ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
+ 0, ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "mft bitmap attribute.");
+ if (ret == -ENOENT)
+ ret = -EIO;
+ goto undo_alloc;
+ }
+ a = ctx->attr;
+ ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+ /* Search back for the previous last allocated cluster of mft bitmap. */
+ for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
+ if (ll >= rl2->vcn)
+ break;
+ }
+ BUG_ON(ll < rl2->vcn);
+ BUG_ON(ll >= rl2->vcn + rl2->length);
+ /* Get the size for the new mapping pairs array for this extent. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
+ if (unlikely(mp_size <= 0)) {
+ ntfs_error(vol->sb, "Get size for mapping pairs failed for "
+ "mft bitmap attribute extent.");
+ ret = mp_size;
+ if (!ret)
+ ret = -EIO;
+ goto undo_alloc;
+ }
+ /* Expand the attribute record if necessary. */
+ old_alen = le32_to_cpu(a->length);
+ ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+ if (unlikely(ret)) {
+ if (ret != -ENOSPC) {
+ ntfs_error(vol->sb, "Failed to resize attribute "
+ "record for mft bitmap attribute.");
+ goto undo_alloc;
+ }
+ // TODO: Deal with this by moving this extent to a new mft
+ // record or by starting a new extent in a new mft record or by
+ // moving other attributes out of this mft record.
+ ntfs_error(vol->sb, "Not enough space in this mft record to "
+				"accommodate extended mft bitmap attribute "
+ "extent. Cannot handle this yet.");
+ ret = -EOPNOTSUPP;
+ goto undo_alloc;
+ }
+ status.mp_rebuilt = 1;
+ /* Generate the mapping pairs array directly into the attr record. */
+ ret = ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+ mp_size, rl2, ll, NULL);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to build mapping pairs array for "
+ "mft bitmap attribute.");
+ goto undo_alloc;
+ }
+ /* Update the highest_vcn. */
+ a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
+ /*
+ * We now have extended the mft bitmap allocated_size by one cluster.
+ * Reflect this in the ntfs_inode structure and the attribute record.
+ */
+ if (a->data.non_resident.lowest_vcn) {
/*
- * If the extent inode was not attached to the base inode, skip
- * this extent mft record.
+ * We are not in the first attribute extent, switch to it, but
+ * first ensure the changes will make it to disk later.
*/
- if (!eni) {
- up(&ni->extent_lock);
- iput(vi);
- continue;
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_reinit_search_ctx(ctx);
+ ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
+ 0, ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find first attribute "
+ "extent of mft bitmap attribute.");
+ goto restore_undo_alloc;
}
+ a = ctx->attr;
+ }
+ mftbmp_ni->allocated_size += vol->cluster_size;
+ a->data.non_resident.allocated_size =
+ cpu_to_sle64(mftbmp_ni->allocated_size);
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ up_write(&mftbmp_ni->runlist.lock);
+ ntfs_debug("Done.");
+ return 0;
+restore_undo_alloc:
+ ntfs_attr_reinit_search_ctx(ctx);
+ if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
+ 0, ctx)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "mft bitmap attribute.%s", es);
+ mftbmp_ni->allocated_size += vol->cluster_size;
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ up_write(&mftbmp_ni->runlist.lock);
/*
- * Found the extent inode corrsponding to this extent mft
- * record. If it is dirty, no need to search further.
+ * The only thing that is now wrong is ->allocated_size of the
+ * base attribute extent which chkdsk should be able to fix.
*/
- if (NInoDirty(eni)) {
- up(&ni->extent_lock);
- iput(vi);
- is_dirty = TRUE;
+ NVolSetErrors(vol);
+ return ret;
+ }
+ a = ctx->attr;
+ a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
+undo_alloc:
+ if (status.added_cluster) {
+ /* Truncate the last run in the runlist by one cluster. */
+ rl->length--;
+ rl[1].vcn--;
+ } else if (status.added_run) {
+ lcn = rl->lcn;
+ /* Remove the last run from the runlist. */
+ rl->lcn = rl[1].lcn;
+ rl->length = 0;
+ }
+ /* Deallocate the cluster. */
+ down_write(&vol->lcnbmp_lock);
+ if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
+ ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
+ NVolSetErrors(vol);
+ }
+ up_write(&vol->lcnbmp_lock);
+ if (status.mp_rebuilt) {
+ if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ old_alen - le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ rl2, ll, NULL)) {
+ ntfs_error(vol->sb, "Failed to restore mapping pairs "
+ "array.%s", es);
+ NVolSetErrors(vol);
+ }
+ if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
+ "record.%s", es);
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ }
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (!IS_ERR(mrec))
+ unmap_mft_record(mft_ni);
+ up_write(&mftbmp_ni->runlist.lock);
+ return ret;
+}
+
+/**
+ * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
+ * @vol: volume on which to extend the mft bitmap attribute
+ *
+ * Extend the initialized portion of the mft bitmap attribute on the ntfs
+ * volume @vol by 8 bytes.
+ *
+ * Note: Only changes initialized_size and data_size, i.e. requires that
+ * allocated_size is big enough to fit the new initialized_size.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: Caller must hold vol->mftbmp_lock for writing.
+ */
+static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
+{
+ s64 old_data_size, old_initialized_size;
+ struct inode *mftbmp_vi;
+ ntfs_inode *mft_ni, *mftbmp_ni;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *mrec;
+ ATTR_RECORD *a;
+ int ret;
+
+	ntfs_debug("Extending mft bitmap initialized (and data) size.");
+ mft_ni = NTFS_I(vol->mft_ino);
+ mftbmp_vi = vol->mftbmp_ino;
+ mftbmp_ni = NTFS_I(mftbmp_vi);
+ /* Get the attribute record. */
+ mrec = map_mft_record(mft_ni);
+ if (IS_ERR(mrec)) {
+ ntfs_error(vol->sb, "Failed to map mft record.");
+ return PTR_ERR(mrec);
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.");
+ ret = -ENOMEM;
+ goto unm_err_out;
+ }
+ ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find first attribute extent of "
+ "mft bitmap attribute.");
+ if (ret == -ENOENT)
+ ret = -EIO;
+ goto put_err_out;
+ }
+ a = ctx->attr;
+ old_data_size = mftbmp_vi->i_size;
+ old_initialized_size = mftbmp_ni->initialized_size;
+ /*
+ * We can simply update the initialized_size before filling the space
+ * with zeroes because the caller is holding the mft bitmap lock for
+ * writing which ensures that no one else is trying to access the data.
+ */
+ mftbmp_ni->initialized_size += 8;
+ a->data.non_resident.initialized_size =
+ cpu_to_sle64(mftbmp_ni->initialized_size);
+ if (mftbmp_ni->initialized_size > mftbmp_vi->i_size) {
+ mftbmp_vi->i_size = mftbmp_ni->initialized_size;
+ a->data.non_resident.data_size =
+ cpu_to_sle64(mftbmp_vi->i_size);
+ }
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ /* Initialize the mft bitmap attribute value with zeroes. */
+ ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
+ if (likely(!ret)) {
+ ntfs_debug("Done. (Wrote eight initialized bytes to mft "
+				"bitmap.)");
+ return 0;
+ }
+ ntfs_error(vol->sb, "Failed to write to mft bitmap.");
+ /* Try to recover from the error. */
+ mrec = map_mft_record(mft_ni);
+ if (IS_ERR(mrec)) {
+ ntfs_error(vol->sb, "Failed to map mft record.%s", es);
+ NVolSetErrors(vol);
+ return ret;
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.%s", es);
+ NVolSetErrors(vol);
+ goto unm_err_out;
+ }
+ if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
+ ntfs_error(vol->sb, "Failed to find first attribute extent of "
+ "mft bitmap attribute.%s", es);
+ NVolSetErrors(vol);
+put_err_out:
+ ntfs_attr_put_search_ctx(ctx);
+unm_err_out:
+ unmap_mft_record(mft_ni);
+ goto err_out;
+ }
+ a = ctx->attr;
+ mftbmp_ni->initialized_size = old_initialized_size;
+ a->data.non_resident.initialized_size =
+ cpu_to_sle64(old_initialized_size);
+ if (mftbmp_vi->i_size != old_data_size) {
+ mftbmp_vi->i_size = old_data_size;
+ a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
+ "data_size 0x%llx, initialized_size 0x%llx.",
+ (long long)mftbmp_ni->allocated_size,
+ (long long)mftbmp_vi->i_size,
+ (long long)mftbmp_ni->initialized_size);
+err_out:
+ return ret;
+}
+
+/**
+ * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
+ * @vol: volume on which to extend the mft data attribute
+ *
+ * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
+ * worth of clusters or if not enough space for this by one mft record worth
+ * of clusters.
+ *
+ * Note: Only changes allocated_size, i.e. does not touch initialized_size or
+ * data_size.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - Caller must hold vol->mftbmp_lock for writing.
+ * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
+ * writing and releases it before returning.
+ * - This function calls functions which take vol->lcnbmp_lock for
+ * writing and release it before returning.
+ */
+static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
+{
+ LCN lcn;
+ VCN old_last_vcn;
+ s64 min_nr, nr, ll = 0;
+ ntfs_inode *mft_ni;
+ runlist_element *rl, *rl2;
+ ntfs_attr_search_ctx *ctx = NULL;
+ MFT_RECORD *mrec;
+ ATTR_RECORD *a = NULL;
+ int ret, mp_size;
+ u32 old_alen = 0;
+ BOOL mp_rebuilt = FALSE;
+
+ ntfs_debug("Extending mft data allocation.");
+ mft_ni = NTFS_I(vol->mft_ino);
+ /*
+ * Determine the preferred allocation location, i.e. the last lcn of
+ * the mft data attribute. The allocated size of the mft data
+ * attribute cannot be zero so we are ok to do this.
+ * ntfs_find_vcn() returns the runlist locked on success.
+ */
+ rl = ntfs_find_vcn(mft_ni, (mft_ni->allocated_size - 1) >>
+ vol->cluster_size_bits, TRUE);
+ if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+ ntfs_error(vol->sb, "Failed to determine last allocated "
+ "cluster of mft data attribute.");
+ if (!IS_ERR(rl)) {
+ up_write(&mft_ni->runlist.lock);
+ ret = -EIO;
+ } else
+ ret = PTR_ERR(rl);
+ return ret;
+ }
+ lcn = rl->lcn + rl->length;
+ ntfs_debug("Last lcn of mft data attribute is 0x%llx.",
+ (long long)lcn);
+ /* Minimum allocation is one mft record worth of clusters. */
+ min_nr = vol->mft_record_size >> vol->cluster_size_bits;
+ if (!min_nr)
+ min_nr = 1;
+ /* Want to allocate 16 mft records worth of clusters. */
+ nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
+ if (!nr)
+ nr = min_nr;
+ /* Ensure we do not go above 2^32-1 mft records. */
+ if (unlikely((mft_ni->allocated_size +
+ (nr << vol->cluster_size_bits)) >>
+ vol->mft_record_size_bits >= (1ll << 32))) {
+ nr = min_nr;
+ if (unlikely((mft_ni->allocated_size +
+ (nr << vol->cluster_size_bits)) >>
+ vol->mft_record_size_bits >= (1ll << 32))) {
+ ntfs_warning(vol->sb, "Cannot allocate mft record "
+ "because the maximum number of inodes "
+ "(2^32) has already been reached.");
+ up_write(&mft_ni->runlist.lock);
+ return -ENOSPC;
+ }
+ }
+ ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
+ nr > min_nr ? "default" : "minimal", (long long)nr);
+ old_last_vcn = rl[1].vcn;
+ do {
+ rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE);
+ if (likely(!IS_ERR(rl2)))
break;
+ if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
+ ntfs_error(vol->sb, "Failed to allocate the minimal "
+ "number of clusters (%lli) for the "
+ "mft data attribute.", (long long)nr);
+ up_write(&mft_ni->runlist.lock);
+ return PTR_ERR(rl2);
}
- /* The extent inode is not dirty, so do the next record. */
- up(&ni->extent_lock);
- iput(vi);
+ /*
+ * There is not enough space to do the allocation, but there
+ * might be enough space to do a minimal allocation so try that
+ * before failing.
+ */
+ nr = min_nr;
+ ntfs_debug("Retrying mft data allocation with minimal cluster "
+ "count %lli.", (long long)nr);
+ } while (1);
+ rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
+ if (IS_ERR(rl)) {
+ up_write(&mft_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to merge runlists for mft data "
+ "attribute.");
+ if (ntfs_cluster_free_from_rl(vol, rl2)) {
+			ntfs_error(vol->sb, "Failed to deallocate clusters "
+ "from the mft data attribute.%s", es);
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl2);
+ return PTR_ERR(rl);
}
- kunmap(page);
- /* If a dirty mft record was found, redirty the page. */
- if (is_dirty) {
- ntfs_debug("Inode 0x%lx is dirty. Redirtying the page "
- "starting at inode 0x%lx.", mft_no,
- page->index << (PAGE_CACHE_SHIFT -
- vol->mft_record_size_bits));
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- } else {
+ mft_ni->runlist.rl = rl;
+ ntfs_debug("Allocated %lli clusters.", nr);
+ /* Find the last run in the new runlist. */
+ for (; rl[1].length; rl++)
+ ;
+ /* Update the attribute record as well. */
+ mrec = map_mft_record(mft_ni);
+ if (IS_ERR(mrec)) {
+ ntfs_error(vol->sb, "Failed to map mft record.");
+ ret = PTR_ERR(mrec);
+ goto undo_alloc;
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.");
+ ret = -ENOMEM;
+ goto undo_alloc;
+ }
+ ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+ CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "mft data attribute.");
+ if (ret == -ENOENT)
+ ret = -EIO;
+ goto undo_alloc;
+ }
+ a = ctx->attr;
+ ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+ /* Search back for the previous last allocated cluster of mft bitmap. */
+ for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
+ if (ll >= rl2->vcn)
+ break;
+ }
+ BUG_ON(ll < rl2->vcn);
+ BUG_ON(ll >= rl2->vcn + rl2->length);
+ /* Get the size for the new mapping pairs array for this extent. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
+ if (unlikely(mp_size <= 0)) {
+ ntfs_error(vol->sb, "Get size for mapping pairs failed for "
+ "mft data attribute extent.");
+ ret = mp_size;
+ if (!ret)
+ ret = -EIO;
+ goto undo_alloc;
+ }
+ /* Expand the attribute record if necessary. */
+ old_alen = le32_to_cpu(a->length);
+ ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+ if (unlikely(ret)) {
+ if (ret != -ENOSPC) {
+ ntfs_error(vol->sb, "Failed to resize attribute "
+ "record for mft data attribute.");
+ goto undo_alloc;
+ }
+ // TODO: Deal with this by moving this extent to a new mft
+ // record or by starting a new extent in a new mft record or by
+ // moving other attributes out of this mft record.
+ // Note: Use the special reserved mft records and ensure that
+ // this extent is not required to find the mft record in
+ // question.
+ ntfs_error(vol->sb, "Not enough space in this mft record to "
+				"accommodate extended mft data attribute "
+ "extent. Cannot handle this yet.");
+ ret = -EOPNOTSUPP;
+ goto undo_alloc;
+ }
+ mp_rebuilt = TRUE;
+ /* Generate the mapping pairs array directly into the attr record. */
+ ret = ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+ mp_size, rl2, ll, NULL);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to build mapping pairs array of "
+ "mft data attribute.");
+ goto undo_alloc;
+ }
+ /* Update the highest_vcn. */
+ a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
+ /*
+ * We now have extended the mft data allocated_size by nr clusters.
+ * Reflect this in the ntfs_inode structure and the attribute record.
+ * @rl is the last (non-terminator) runlist element of mft data
+ * attribute.
+ */
+ if (a->data.non_resident.lowest_vcn) {
+ /*
+ * We are not in the first attribute extent, switch to it, but
+ * first ensure the changes will make it to disk later.
+ */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_reinit_search_ctx(ctx);
+ ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
+ mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
+ ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find first attribute "
+ "extent of mft data attribute.");
+ goto restore_undo_alloc;
+ }
+ a = ctx->attr;
+ }
+ mft_ni->allocated_size += nr << vol->cluster_size_bits;
+ a->data.non_resident.allocated_size =
+ cpu_to_sle64(mft_ni->allocated_size);
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ up_write(&mft_ni->runlist.lock);
+ ntfs_debug("Done.");
+ return 0;
+restore_undo_alloc:
+ ntfs_attr_reinit_search_ctx(ctx);
+ if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+ CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "mft data attribute.%s", es);
+ mft_ni->allocated_size += nr << vol->cluster_size_bits;
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ up_write(&mft_ni->runlist.lock);
+ /*
+ * The only thing that is now wrong is ->allocated_size of the
+ * base attribute extent which chkdsk should be able to fix.
+ */
+ NVolSetErrors(vol);
+ return ret;
+ }
+ a = ctx->attr;
+ a->data.non_resident.highest_vcn = cpu_to_sle64(old_last_vcn - 1);
+undo_alloc:
+ if (ntfs_cluster_free(vol->mft_ino, old_last_vcn, -1) < 0) {
+ ntfs_error(vol->sb, "Failed to free clusters from mft data "
+ "attribute.%s", es);
+ NVolSetErrors(vol);
+ }
+ if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
+ ntfs_error(vol->sb, "Failed to truncate mft data attribute "
+ "runlist.%s", es);
+ NVolSetErrors(vol);
+ }
+ if (mp_rebuilt) {
+ if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ old_alen - le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ rl2, ll, NULL)) {
+ ntfs_error(vol->sb, "Failed to restore mapping pairs "
+ "array.%s", es);
+ NVolSetErrors(vol);
+ }
+ if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
+ "record.%s", es);
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ }
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (!IS_ERR(mrec))
+ unmap_mft_record(mft_ni);
+ up_write(&mft_ni->runlist.lock);
+ return ret;
+}
+
+/**
+ * ntfs_mft_record_layout - layout an mft record into a memory buffer
+ * @vol: volume to which the mft record will belong
+ * @mft_no: mft reference specifying the mft record number
+ * @m: destination buffer of size >= @vol->mft_record_size bytes
+ *
+ * Layout an empty, unused mft record with the mft record number @mft_no into
+ * the buffer @m. The volume @vol is needed because the mft record structure
+ * was modified in NTFS 3.1 so we need to know which volume version this mft
+ * record will be used on.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
+ MFT_RECORD *m)
+{
+ ATTR_RECORD *a;
+
+ ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
+ if (mft_no >= (1ll << 32)) {
+ ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
+ "maximum of 2^32.", (long long)mft_no);
+ return -ERANGE;
+ }
+	/* Start by clearing the whole mft record to give us a clean slate. */
+ memset(m, 0, vol->mft_record_size);
+ /* Aligned to 2-byte boundary. */
+ if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
+ m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
+ else {
+ m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
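+		/*
+		 * (x + 1) & ~1 rounds an odd header size up to the next even
+		 * offset, e.g. 42 stays 42 while 43 would become 44.
+		 */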
/*
- * Keep the VM happy. This must be done otherwise the
- * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
- * the page is clean.
+ * Set the NTFS 3.1+ specific fields while we know that the
+ * volume version is 3.1+.
*/
- BUG_ON(PageWriteback(page));
- set_page_writeback(page);
+ m->reserved = 0;
+ m->mft_record_number = cpu_to_le32((u32)mft_no);
+ }
+ m->magic = magic_FILE;
+ if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
+ m->usa_count = cpu_to_le16(vol->mft_record_size /
+ NTFS_BLOCK_SIZE + 1);
+ else {
+ m->usa_count = cpu_to_le16(1);
+ ntfs_warning(vol->sb, "Sector size is bigger than mft record "
+ "size. Setting usa_count to 1. If chkdsk "
+ "reports this as corruption, please email "
+ "linux-ntfs-dev@lists.sourceforge.net stating "
+ "that you saw this message and that the "
+ "modified file system created was corrupt. "
+ "Thank you.");
+ }
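+	/*
+	 * Example: a 1024-byte mft record protected in 512-byte blocks gets
+	 * usa_count = 1024/512 + 1 = 3, i.e. one update sequence number slot
+	 * plus one fixup word per NTFS block.
+	 */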
+ /* Set the update sequence number to 1. */
+ *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
+ m->lsn = 0;
+ m->sequence_number = cpu_to_le16(1);
+ m->link_count = 0;
+ /*
+ * Place the attributes straight after the update sequence array,
+ * aligned to 8-byte boundary.
+ */
+ m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
+ (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
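+	/*
+	 * Example: usa_ofs 0x30 with usa_count 3 gives 0x30 + 6 = 0x36,
+	 * which (x + 7) & ~7 rounds up to the 8-byte boundary 0x38.
+	 */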
+ m->flags = 0;
+ /*
+ * Using attrs_offset plus eight bytes (for the termination attribute).
+ * attrs_offset is already aligned to 8-byte boundary, so no need to
+ * align again.
+ */
+ m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
+ m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
+ m->base_mft_record = 0;
+ m->next_attr_instance = 0;
+ /* Add the termination attribute. */
+ a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
+ a->type = AT_END;
+ a->length = 0;
+ ntfs_debug("Done.");
+ return 0;
+}
+
+/**
+ * ntfs_mft_record_format - format an mft record on an ntfs volume
+ * @vol: volume on which to format the mft record
+ * @mft_no: mft record number to format
+ *
+ * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
+ * mft record into the appropriate place of the mft data attribute. This is
+ * used when extending the mft data attribute.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
+{
+ struct inode *mft_vi = vol->mft_ino;
+ struct page *page;
+ MFT_RECORD *m;
+ pgoff_t index, end_index;
+ unsigned int ofs;
+ int err;
+
+ ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
+ /*
+ * The index into the page cache and the offset within the page cache
+ * page of the wanted mft record.
+ */
+ index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
+ ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
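+	/*
+	 * Example: with 1024-byte mft records and 4kiB pages there are four
+	 * records per page, so mft record 5 is in page 1 at offset 0x400.
+	 */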
+ /* The maximum valid index into the page cache for $MFT's data. */
+ end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
+ if (unlikely(index >= end_index)) {
+ if (unlikely(index > end_index || ofs + vol->mft_record_size >=
+ (mft_vi->i_size & ~PAGE_CACHE_MASK))) {
+ ntfs_error(vol->sb, "Tried to format non-existing mft "
+ "record 0x%llx.", (long long)mft_no);
+ return -ENOENT;
+ }
+ }
+ /* Read, map, and pin the page containing the mft record. */
+ page = ntfs_map_page(mft_vi->i_mapping, index);
+ if (unlikely(IS_ERR(page))) {
+ ntfs_error(vol->sb, "Failed to map page containing mft record "
+ "to format 0x%llx.", (long long)mft_no);
+ return PTR_ERR(page);
+ }
+ lock_page(page);
+ BUG_ON(!PageUptodate(page));
+ ClearPageUptodate(page);
+ m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
+ err = ntfs_mft_record_layout(vol, mft_no, m);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
+ (long long)mft_no);
+ SetPageUptodate(page);
unlock_page(page);
- end_page_writeback(page);
+ ntfs_unmap_page(page);
+ return err;
}
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ /*
+ * Make sure the mft record is written out to disk. We could use
+ * ilookup5() to check if an inode is in icache and so on but this is
+ * unnecessary as ntfs_writepage() will write the dirty record anyway.
+ */
+ mark_ntfs_record_dirty(page, ofs);
+ ntfs_unmap_page(page);
ntfs_debug("Done.");
return 0;
}
+
+/**
+ * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
+ * @vol: [IN] volume on which to allocate the mft record
+ * @mode: [IN] mode if want a file or directory, i.e. base inode or 0
+ * @base_ni: [IN] open base inode if allocating an extent mft record or NULL
+ * @mrec: [OUT] on successful return this is the mapped mft record
+ *
+ * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
+ *
+ * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
+ * directory inode, and allocate it at the default allocator position. In
+ * this case @mode is the file mode as given to us by the caller. We in
+ * particular use @mode to distinguish whether a file or a directory is being
+ * created (S_ISREG(mode) and S_ISDIR(mode), respectively).
+ *
+ * If @base_ni is not NULL make the allocated mft record an extent record,
+ * allocate it starting at the mft record after the base mft record and attach
+ * the allocated and opened ntfs inode to the base inode @base_ni. In this
+ * case @mode must be 0 as it is meaningless for extent inodes.
+ *
+ * You need to check the return value with IS_ERR(). If false, the function
+ * was successful and the return value is the now opened ntfs inode of the
+ * allocated mft record. *@mrec is then set to the allocated, mapped, pinned,
+ * and locked mft record. If IS_ERR() is true, the function failed and the
+ * error code is obtained from PTR_ERR(return value). *@mrec is undefined in
+ * this case.
+ *
+ * Allocation strategy:
+ *
+ * To find a free mft record, we scan the mft bitmap for a zero bit. To
+ * optimize this we start scanning at the place specified by @base_ni or if
+ * @base_ni is NULL we start where we last stopped and we perform wrap around
+ * when we reach the end. Note, we do not try to allocate mft records below
+ * number 24 because numbers 0 to 15 are the defined system files anyway and 16
+ * to 23 are special in that they are used for storing extension mft records
+ * for the $DATA attribute of $MFT. This is required to avoid the possibility
+ * of creating a runlist with a circular dependency which once written to disk
+ * can never be read in again. Windows will only use records 16 to 23 for
+ * normal files if the volume is completely out of space. We never use them
+ * which means that when the volume is really out of space we cannot create any
+ * more files while Windows can still create up to 8 small files. We can start
+ * doing this at some later time, it does not matter much for now.
+ *
+ * When scanning the mft bitmap, we only search up to the last allocated mft
+ * record. If there are no free records left in the range 24 to number of
+ * allocated mft records, then we extend the $MFT/$DATA attribute in order to
+ * create free mft records. We extend the allocated size of $MFT/$DATA by 16
+ * records at a time or one cluster, if cluster size is above 16kiB. If there
+ * is not sufficient space to do this, we try to extend by a single mft record
+ * or one cluster, if cluster size is above the mft record size.
+ *
+ * No matter how many mft records we allocate, we initialize only the first
+ * allocated mft record, incrementing mft data size and initialized size
+ * accordingly, open an ntfs_inode for it and return it to the caller, unless
+ * there are fewer than 24 mft records, in which case we allocate and initialize
+ * mft records until we reach record 24 which we consider as the first free mft
+ * record for use by normal files.
+ *
+ * If during any stage we overflow the initialized data in the mft bitmap, we
+ * extend the initialized size (and data size) by 8 bytes, allocating another
+ * cluster if required. The bitmap data size has to be at least equal to the
+ * number of mft records in the mft, but it can be bigger, in which case the
+ * superfluous bits are padded with zeroes.
+ *
+ * Thus, when we return successfully (IS_ERR() is false), we will have:
+ * - initialized / extended the mft bitmap if necessary,
+ * - initialized / extended the mft data if necessary,
+ * - set the bit corresponding to the mft record being allocated in the
+ * mft bitmap,
+ * - opened an ntfs_inode for the allocated mft record, and we will have
+ * - returned the ntfs_inode as well as the allocated mapped, pinned, and
+ * locked mft record.
+ *
+ * On error, the volume will be left in a consistent state and no record will
+ * be allocated. If rolling back a partial operation fails, we may leave some
+ * inconsistent metadata, in which case we set NVolSetErrors() so the volume is
+ * left dirty when unmounted.
+ *
+ * Note, this function cannot make use of most of the normal functions, like
+ * for example for attribute resizing, etc, because when the run list overflows
+ * the base mft record and an attribute list is used, it is very important that
+ * the extension mft records used to store the $DATA attribute of $MFT can be
+ * reached without having to read the information contained inside them, as
+ * this would make it impossible to find them in the first place after the
+ * volume is unmounted. $MFT/$BITMAP probably does not need to follow this
+ * rule because the bitmap is not essential for finding the mft records, but on
+ * the other hand, handling the bitmap in this special way would make life
+ * easier because otherwise there might be circular invocations of functions
+ * when reading the bitmap.
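+ *
+ * Minimal usage sketch (illustrative only, error handling mostly elided):
+ *
+ *	MFT_RECORD *m;
+ *	ntfs_inode *ni = ntfs_mft_record_alloc(vol, S_IFREG, NULL, &m);
+ *	if (IS_ERR(ni))
+ *		return PTR_ERR(ni);
+ *	(... add attributes to the mapped mft record @m, then ...)
+ *	mark_mft_record_dirty(ni);
+ *	unmap_mft_record(ni);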
+ */
+ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
+ ntfs_inode *base_ni, MFT_RECORD **mrec)
+{
+ s64 ll, bit, old_data_initialized, old_data_size;
+ struct inode *vi;
+ struct page *page;
+ ntfs_inode *mft_ni, *mftbmp_ni, *ni;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ pgoff_t index;
+ unsigned int ofs;
+ int err;
+ le16 seq_no, usn;
+ BOOL record_formatted = FALSE;
+
+ if (base_ni) {
+ ntfs_debug("Entering (allocating an extent mft record for "
+ "base mft record 0x%llx).",
+ (long long)base_ni->mft_no);
+ /* @mode and @base_ni are mutually exclusive. */
+ BUG_ON(mode);
+ } else
+ ntfs_debug("Entering (allocating a base mft record).");
+ if (mode) {
+ /* @mode and @base_ni are mutually exclusive. */
+ BUG_ON(base_ni);
+ /* We only support creation of normal files and directories. */
+ if (!S_ISREG(mode) && !S_ISDIR(mode))
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ BUG_ON(!mrec);
+ mft_ni = NTFS_I(vol->mft_ino);
+ mftbmp_ni = NTFS_I(vol->mftbmp_ino);
+ down_write(&vol->mftbmp_lock);
+ bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
+ if (bit >= 0) {
+ ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
+ (long long)bit);
+ goto have_alloc_rec;
+ }
+ if (bit != -ENOSPC) {
+ up_write(&vol->mftbmp_lock);
+ return ERR_PTR(bit);
+ }
+ /*
+ * No free mft records left. If the mft bitmap already covers more
+ * than the currently used mft records, the next records are all free,
+ * so we can simply allocate the first unused mft record.
+ * Note: We also have to make sure that the mft bitmap at least covers
+ * the first 24 mft records as they are special and whilst they may not
+ * be in use, we do not allocate from them.
+ */
+ ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
+ if (mftbmp_ni->initialized_size << 3 > ll &&
+ mftbmp_ni->initialized_size > 3) {
+ bit = ll;
+ if (bit < 24)
+ bit = 24;
+ if (unlikely(bit >= (1ll << 32)))
+ goto max_err_out;
+ ntfs_debug("Found free record (#2), bit 0x%llx.",
+ (long long)bit);
+ goto found_free_rec;
+ }
+ /*
+ * The mft bitmap needs to be expanded until it covers the first unused
+ * mft record that we can allocate.
+ * Note: The smallest mft record we allocate is mft record 24.
+ */
+ bit = mftbmp_ni->initialized_size << 3;
+ if (unlikely(bit >= (1ll << 32)))
+ goto max_err_out;
+ ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
+ "data_size 0x%llx, initialized_size 0x%llx.",
+ (long long)mftbmp_ni->allocated_size,
+ (long long)vol->mftbmp_ino->i_size,
+ (long long)mftbmp_ni->initialized_size);
+ if (mftbmp_ni->initialized_size + 8 > mftbmp_ni->allocated_size) {
+ /* Need to extend bitmap by one more cluster. */
+ ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
+ err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
+ if (unlikely(err)) {
+ up_write(&vol->mftbmp_lock);
+ goto err_out;
+ }
+ ntfs_debug("Status of mftbmp after allocation extension: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mftbmp_ni->allocated_size,
+ (long long)vol->mftbmp_ino->i_size,
+ (long long)mftbmp_ni->initialized_size);
+ }
+ /*
+ * We now have sufficient allocated space, extend the initialized_size
+ * as well as the data_size if necessary and fill the new space with
+ * zeroes.
+ */
+ err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
+ if (unlikely(err)) {
+ up_write(&vol->mftbmp_lock);
+ goto err_out;
+ }
+	ntfs_debug("Status of mftbmp after initialized extension: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mftbmp_ni->allocated_size,
+ (long long)vol->mftbmp_ino->i_size,
+ (long long)mftbmp_ni->initialized_size);
+ ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
+found_free_rec:
+ /* @bit is the found free mft record, allocate it in the mft bitmap. */
+ ntfs_debug("At found_free_rec.");
+ err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
+ up_write(&vol->mftbmp_lock);
+ goto err_out;
+ }
+ ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
+have_alloc_rec:
+ /*
+ * The mft bitmap is now uptodate. Deal with mft data attribute now.
+ * Note, we keep hold of the mft bitmap lock for writing until all
+ * modifications to the mft data attribute are complete, too, as they
+ * will impact decisions for mft bitmap and mft record allocation done
+ * by a parallel allocation and if the lock is not maintained a
+ * parallel allocation could allocate the same mft record as this one.
+ */
+ ll = (bit + 1) << vol->mft_record_size_bits;
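+	/*
+	 * ll is the byte offset just past the end of mft record @bit, e.g.
+	 * bit 0x20 with 1024-byte records gives ll = 0x21 << 10 = 0x8400.
+	 */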
+ if (ll <= mft_ni->initialized_size) {
+ ntfs_debug("Allocated mft record already initialized.");
+ goto mft_rec_already_initialized;
+ }
+ ntfs_debug("Initializing allocated mft record.");
+ /*
+ * The mft record is outside the initialized data. Extend the mft data
+ * attribute until it covers the allocated record. The loop is only
+ * actually traversed more than once when a freshly formatted volume is
+ * first written to so it optimizes away nicely in the common case.
+ */
+ ntfs_debug("Status of mft data before extension: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mft_ni->allocated_size,
+ (long long)vol->mft_ino->i_size,
+ (long long)mft_ni->initialized_size);
+ while (ll > mft_ni->allocated_size) {
+ err = ntfs_mft_data_extend_allocation_nolock(vol);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to extend mft data "
+ "allocation.");
+ goto undo_mftbmp_alloc_nolock;
+ }
+ ntfs_debug("Status of mft data after allocation extension: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mft_ni->allocated_size,
+ (long long)vol->mft_ino->i_size,
+ (long long)mft_ni->initialized_size);
+ }
+ /*
+ * Extend mft data initialized size (and data size of course) to reach
+	 * the allocated mft record, formatting the mft records along the way.
+ * Note: We only modify the ntfs_inode structure as that is all that is
+ * needed by ntfs_mft_record_format(). We will update the attribute
+ * record itself in one fell swoop later on.
+ */
+ old_data_initialized = mft_ni->initialized_size;
+ old_data_size = vol->mft_ino->i_size;
+ while (ll > mft_ni->initialized_size) {
+ s64 new_initialized_size, mft_no;
+
+ new_initialized_size = mft_ni->initialized_size +
+ vol->mft_record_size;
+ mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
+ if (new_initialized_size > vol->mft_ino->i_size)
+ vol->mft_ino->i_size = new_initialized_size;
+ ntfs_debug("Initializing mft record 0x%llx.",
+ (long long)mft_no);
+ err = ntfs_mft_record_format(vol, mft_no);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to format mft record.");
+ goto undo_data_init;
+ }
+ mft_ni->initialized_size = new_initialized_size;
+ }
+ record_formatted = TRUE;
+ /* Update the mft data attribute record to reflect the new sizes. */
+ m = map_mft_record(mft_ni);
+ if (IS_ERR(m)) {
+ ntfs_error(vol->sb, "Failed to map mft record.");
+ err = PTR_ERR(m);
+ goto undo_data_init;
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, m);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.");
+ err = -ENOMEM;
+ unmap_mft_record(mft_ni);
+ goto undo_data_init;
+ }
+ err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to find first attribute extent of "
+ "mft data attribute.");
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ goto undo_data_init;
+ }
+ a = ctx->attr;
+ a->data.non_resident.initialized_size =
+ cpu_to_sle64(mft_ni->initialized_size);
+ a->data.non_resident.data_size = cpu_to_sle64(vol->mft_ino->i_size);
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ ntfs_debug("Status of mft data after mft record initialization: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mft_ni->allocated_size,
+ (long long)vol->mft_ino->i_size,
+ (long long)mft_ni->initialized_size);
+ BUG_ON(vol->mft_ino->i_size > mft_ni->allocated_size);
+ BUG_ON(mft_ni->initialized_size > vol->mft_ino->i_size);
+mft_rec_already_initialized:
+ /*
+ * We can finally drop the mft bitmap lock as the mft data attribute
+ * has been fully updated. The only disparity left is that the
+ * allocated mft record still needs to be marked as in use to match the
+ * set bit in the mft bitmap but this is actually not a problem since
+ * this mft record is not referenced from anywhere yet and the fact
+ * that it is allocated in the mft bitmap means that no-one will try to
+ * allocate it either.
+ */
+ up_write(&vol->mftbmp_lock);
+ /*
+ * We now have allocated and initialized the mft record. Calculate the
+ * index of and the offset within the page cache page the record is in.
+ */
+ index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
+ ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+ /* Read, map, and pin the page containing the mft record. */
+ page = ntfs_map_page(vol->mft_ino->i_mapping, index);
+ if (unlikely(IS_ERR(page))) {
+ ntfs_error(vol->sb, "Failed to map page containing allocated "
+ "mft record 0x%llx.", (long long)bit);
+ err = PTR_ERR(page);
+ goto undo_mftbmp_alloc;
+ }
+ lock_page(page);
+ BUG_ON(!PageUptodate(page));
+ ClearPageUptodate(page);
+ m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
+ /* If we just formatted the mft record no need to do it again. */
+ if (!record_formatted) {
+ /* Sanity check that the mft record is really not in use. */
+ if (ntfs_is_file_record(m->magic) &&
+ (m->flags & MFT_RECORD_IN_USE)) {
+ ntfs_error(vol->sb, "Mft record 0x%llx was marked "
+ "free in mft bitmap but is marked "
+ "used itself. Corrupt filesystem. "
+ "Unmount and run chkdsk.",
+ (long long)bit);
+ err = -EIO;
+ SetPageUptodate(page);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ NVolSetErrors(vol);
+ goto undo_mftbmp_alloc;
+ }
+ /*
+ * We need to (re-)format the mft record, preserving the
+ * sequence number if it is not zero as well as the update
+ * sequence number if it is not zero or -1 (0xffff). This
+ * means we do not need to care whether or not something went
+ * wrong with the previous mft record.
+ */
+ seq_no = m->sequence_number;
+ usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
+ err = ntfs_mft_record_layout(vol, bit, m);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to layout allocated mft "
+ "record 0x%llx.", (long long)bit);
+ SetPageUptodate(page);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto undo_mftbmp_alloc;
+ }
+ if (seq_no)
+ m->sequence_number = seq_no;
+ if (usn && le16_to_cpu(usn) != 0xffff)
+ *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
+ }
+ /* Set the mft record itself in use. */
+ m->flags |= MFT_RECORD_IN_USE;
+ if (S_ISDIR(mode))
+ m->flags |= MFT_RECORD_IS_DIRECTORY;
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ if (base_ni) {
+ /*
+ * Setup the base mft record in the extent mft record. This
+ * completes initialization of the allocated extent mft record
+ * and we can simply use it with map_extent_mft_record().
+ */
+ m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
+ base_ni->seq_no);
+ /*
+ * Allocate an extent inode structure for the new mft record,
+		 * attach it to the base inode @base_ni, and map, pin, and
+		 * lock its mft record, i.e. the one just allocated.
+ */
+ m = map_extent_mft_record(base_ni, bit, &ni);
+ if (IS_ERR(m)) {
+ ntfs_error(vol->sb, "Failed to map allocated extent "
+ "mft record 0x%llx.", (long long)bit);
+			err = PTR_ERR(m);
+			/*
+			 * @m is an error pointer here, so recompute the
+			 * record address from the still mapped and locked
+			 * page before marking the record not in use.
+			 */
+			m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
+			m->flags &= cpu_to_le16(
+					~le16_to_cpu(MFT_RECORD_IN_USE));
+ flush_dcache_page(page);
+ /* Make sure the mft record is written out to disk. */
+ mark_ntfs_record_dirty(page, ofs);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto undo_mftbmp_alloc;
+ }
+ /*
+ * Make sure the allocated mft record is written out to disk.
+ * No need to set the inode dirty because the caller is going
+ * to do that anyway after finishing with the new extent mft
+ * record (e.g. at a minimum a new attribute will be added to
+		 * the mft record).
+ */
+ mark_ntfs_record_dirty(page, ofs);
+ unlock_page(page);
+ /*
+ * Need to unmap the page since map_extent_mft_record() mapped
+ * it as well so we have it mapped twice at the moment.
+ */
+ ntfs_unmap_page(page);
+ } else {
+ /*
+ * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink
+ * is set to 1 but the mft record->link_count is 0. The caller
+ * needs to bear this in mind.
+ */
+ vi = new_inode(vol->sb);
+ if (unlikely(!vi)) {
+ err = -ENOMEM;
+ /* Set the mft record itself not in use. */
+ m->flags &= cpu_to_le16(
+ ~le16_to_cpu(MFT_RECORD_IN_USE));
+ flush_dcache_page(page);
+ /* Make sure the mft record is written out to disk. */
+ mark_ntfs_record_dirty(page, ofs);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto undo_mftbmp_alloc;
+ }
+ vi->i_ino = bit;
+ /*
+ * This is the optimal IO size (for stat), not the fs block
+ * size.
+ */
+ vi->i_blksize = PAGE_CACHE_SIZE;
+ /*
+ * This is for checking whether an inode has changed w.r.t. a
+ * file so that the file can be updated if necessary (compare
+ * with f_version).
+ */
+ vi->i_version = 1;
+
+ /* The owner and group come from the ntfs volume. */
+ vi->i_uid = vol->uid;
+ vi->i_gid = vol->gid;
+
+ /* Initialize the ntfs specific part of @vi. */
+ ntfs_init_big_inode(vi);
+ ni = NTFS_I(vi);
+ /*
+ * Set the appropriate mode, attribute type, and name. For
+ * directories, also setup the index values to the defaults.
+ */
+ if (S_ISDIR(mode)) {
+ vi->i_mode = S_IFDIR | S_IRWXUGO;
+ vi->i_mode &= ~vol->dmask;
+
+ NInoSetMstProtected(ni);
+ ni->type = AT_INDEX_ALLOCATION;
+ ni->name = I30;
+ ni->name_len = 4;
+
+ ni->itype.index.block_size = 4096;
+ ni->itype.index.block_size_bits = generic_ffs(4096) - 1;
+ ni->itype.index.collation_rule = COLLATION_FILE_NAME;
+ if (vol->cluster_size <= ni->itype.index.block_size) {
+ ni->itype.index.vcn_size = vol->cluster_size;
+ ni->itype.index.vcn_size_bits =
+ vol->cluster_size_bits;
+ } else {
+ ni->itype.index.vcn_size = vol->sector_size;
+ ni->itype.index.vcn_size_bits =
+ vol->sector_size_bits;
+ }
+ } else {
+ vi->i_mode = S_IFREG | S_IRWXUGO;
+ vi->i_mode &= ~vol->fmask;
+
+ ni->type = AT_DATA;
+ ni->name = NULL;
+ ni->name_len = 0;
+ }
+ if (IS_RDONLY(vi))
+ vi->i_mode &= ~S_IWUGO;
+
+ /* Set the inode times to the current time. */
+ vi->i_atime = vi->i_mtime = vi->i_ctime = current_kernel_time();
+ /*
+ * Set the file size to 0, the ntfs inode sizes are set to 0 by
+		 * the call to ntfs_init_big_inode() above.
+ */
+ vi->i_size = 0;
+ vi->i_blocks = 0;
+
+ /* Set the sequence number. */
+ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+ /*
+ * Manually map, pin, and lock the mft record as we already
+ * have its page mapped and it is very easy to do.
+ */
+ atomic_inc(&ni->count);
+ down(&ni->mrec_lock);
+ ni->page = page;
+ ni->page_ofs = ofs;
+ /*
+ * Make sure the allocated mft record is written out to disk.
+ * NOTE: We do not set the ntfs inode dirty because this would
+ * fail in ntfs_write_inode() because the inode does not have a
+ * standard information attribute yet. Also, there is no need
+ * to set the inode dirty because the caller is going to do
+ * that anyway after finishing with the new mft record (e.g. at
+ * a minimum some new attributes will be added to the mft
+		 * record).
+ */
+ mark_ntfs_record_dirty(page, ofs);
+ unlock_page(page);
+
+ /* Add the inode to the inode hash for the superblock. */
+ insert_inode_hash(vi);
+
+ /* Update the default mft allocation position. */
+ vol->mft_data_pos = bit + 1;
+ }
+ /*
+ * Return the opened, allocated inode of the allocated mft record as
+ * well as the mapped, pinned, and locked mft record.
+ */
+ ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
+ base_ni ? "extent " : "", (long long)bit);
+ *mrec = m;
+ return ni;
+undo_data_init:
+ mft_ni->initialized_size = old_data_initialized;
+ vol->mft_ino->i_size = old_data_size;
+ goto undo_mftbmp_alloc_nolock;
+undo_mftbmp_alloc:
+ down_write(&vol->mftbmp_lock);
+undo_mftbmp_alloc_nolock:
+ if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
+ ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
+ NVolSetErrors(vol);
+ }
+ up_write(&vol->mftbmp_lock);
+err_out:
+ return ERR_PTR(err);
+max_err_out:
+ ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
+ "number of inodes (2^32) has already been reached.");
+ up_write(&vol->mftbmp_lock);
+ return ERR_PTR(-ENOSPC);
+}
+
+/**
+ * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
+ * @ni: ntfs inode of the mapped extent mft record to free
+ * @m: mapped extent mft record of the ntfs inode @ni
+ *
+ * Free the mapped extent mft record @m of the extent ntfs inode @ni.
+ *
+ * Note that this function unmaps the mft record and closes and destroys @ni
+ * internally and hence you cannot use either @ni or @m any more after this
+ * function returns success.
+ *
+ * On success return 0 and on error return -errno. @ni and @m are still valid
+ * in this case and have not been freed.
+ *
+ * For some errors an error message is displayed, the success code 0 is
+ * returned, and the volume is then left dirty on umount. This makes sense in
+ * case we could not roll back the changes that were already done, since the
+ * caller no longer wants to reference this mft record so it does not matter to
+ * the caller if something is wrong with it as long as it is properly detached
+ * from the base inode.
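+ *
+ * Minimal usage sketch (illustrative only):
+ *
+ *	if (ntfs_extent_mft_record_free(ni, m)) {
+ *		(error: @ni and @m are still valid and still mapped, so the
+ *		caller may e.g. unmap_extent_mft_record(ni) and retry later)
+ *	}
+ *	(success: both @ni and @m are gone and must not be touched)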
+ */
+int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
+{
+ unsigned long mft_no = ni->mft_no;
+ ntfs_volume *vol = ni->vol;
+ ntfs_inode *base_ni;
+ ntfs_inode **extent_nis;
+ int i, err;
+ le16 old_seq_no;
+ u16 seq_no;
+
+ BUG_ON(NInoAttr(ni));
+ BUG_ON(ni->nr_extents != -1);
+
+ down(&ni->extent_lock);
+ base_ni = ni->ext.base_ntfs_ino;
+ up(&ni->extent_lock);
+
+ BUG_ON(base_ni->nr_extents <= 0);
+
+	ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.",
+ mft_no, base_ni->mft_no);
+
+ down(&base_ni->extent_lock);
+
+ /* Make sure we are holding the only reference to the extent inode. */
+ if (atomic_read(&ni->count) > 2) {
+ ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
+				"not freeing.", mft_no);
+ up(&base_ni->extent_lock);
+ return -EBUSY;
+ }
+
+ /* Dissociate the ntfs inode from the base inode. */
+ extent_nis = base_ni->ext.extent_ntfs_inos;
+ err = -ENOENT;
+ for (i = 0; i < base_ni->nr_extents; i++) {
+ if (ni != extent_nis[i])
+ continue;
+ extent_nis += i;
+ base_ni->nr_extents--;
+ memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
+ sizeof(ntfs_inode*));
+ err = 0;
+ break;
+ }
+
+ up(&base_ni->extent_lock);
+
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
+ "its base inode 0x%lx.", mft_no,
+ base_ni->mft_no);
+ BUG();
+ }
+
+ /*
+ * The extent inode is no longer attached to the base inode so no one
+ * can get a reference to it any more.
+ */
+
+ /* Mark the mft record as not in use. */
+ m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE));
+
+ /* Increment the sequence number, skipping zero, if it is not zero. */
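+	/* Example: 5 becomes 6, 0xffff wraps to 1, and 0 stays 0. */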
+ old_seq_no = m->sequence_number;
+ seq_no = le16_to_cpu(old_seq_no);
+ if (seq_no == 0xffff)
+ seq_no = 1;
+ else if (seq_no)
+ seq_no++;
+ m->sequence_number = cpu_to_le16(seq_no);
+
+ /*
+ * Set the ntfs inode dirty and write it out. We do not need to worry
+ * about the base inode here since whatever caused the extent mft
+ * record to be freed is guaranteed to do it already.
+ */
+ NInoSetDirty(ni);
+ err = write_mft_record(ni, m, 0);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
+ "freeing.", mft_no);
+ goto rollback;
+ }
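+	/* Note: the success path deliberately falls through this label. */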
+rollback_error:
+ /* Unmap and throw away the now freed extent inode. */
+ unmap_extent_mft_record(ni);
+ ntfs_clear_extent_inode(ni);
+
+ /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
+ down_write(&vol->mftbmp_lock);
+ err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
+ up_write(&vol->mftbmp_lock);
+ if (unlikely(err)) {
+ /*
+ * The extent inode is gone but we failed to deallocate it in
+ * the mft bitmap. Just emit a warning and leave the volume
+ * dirty on umount.
+ */
+ ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
+ NVolSetErrors(vol);
+ }
+ return 0;
+rollback:
+ /* Rollback what we did... */
+ down(&base_ni->extent_lock);
+ extent_nis = base_ni->ext.extent_ntfs_inos;
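+	/*
+	 * The extent array is grown four entries at a time, so a multiple of
+	 * four in nr_extents means the array is full and must be enlarged
+	 * before @ni can be re-attached to it.
+	 */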
+ if (!(base_ni->nr_extents & 3)) {
+ int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
+
+ extent_nis = (ntfs_inode**)kmalloc(new_size, GFP_NOFS);
+ if (unlikely(!extent_nis)) {
+ ntfs_error(vol->sb, "Failed to allocate internal "
+ "buffer during rollback.%s", es);
+ up(&base_ni->extent_lock);
+ NVolSetErrors(vol);
+ goto rollback_error;
+ }
+ if (base_ni->nr_extents) {
+ BUG_ON(!base_ni->ext.extent_ntfs_inos);
+ memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
+ new_size - 4 * sizeof(ntfs_inode*));
+ kfree(base_ni->ext.extent_ntfs_inos);
+ }
+ base_ni->ext.extent_ntfs_inos = extent_nis;
+ }
+ m->flags |= MFT_RECORD_IN_USE;
+ m->sequence_number = old_seq_no;
+ extent_nis[base_ni->nr_extents++] = ni;
+ up(&base_ni->extent_lock);
+ mark_mft_record_dirty(ni);
+ return err;
+}
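+
+/*
+ * A minimal illustration (editorial sketch, not used by the driver) of the
+ * sequence number rule applied above: 0xffff wraps to 1 so that zero is
+ * skipped, zero itself is preserved, and anything else simply increments.
+ */
+static inline u16 ntfs_next_seq_no_sketch(u16 seq_no)
+{
+	if (seq_no == 0xffff)
+		return 1;
+	return seq_no ? seq_no + 1 : 0;
+}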
#endif /* NTFS_RW */
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
index 4fd9b5ec6e0c..407de2cef1d6 100644
--- a/fs/ntfs/mft.h
+++ b/fs/ntfs/mft.h
@@ -24,11 +24,11 @@
#define _LINUX_NTFS_MFT_H
#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
#include "inode.h"
-extern int format_mft_record(ntfs_inode *ni, MFT_RECORD *m);
-
extern MFT_RECORD *map_mft_record(ntfs_inode *ni);
extern void unmap_mft_record(ntfs_inode *ni);
@@ -76,6 +76,9 @@ static inline void mark_mft_record_dirty(ntfs_inode *ni)
__mark_mft_record_dirty(ni);
}
+extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
+ MFT_RECORD *m, int sync);
+
extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
/**
@@ -111,6 +114,14 @@ static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
return err;
}
+extern BOOL ntfs_may_write_mft_record(ntfs_volume *vol,
+ const unsigned long mft_no, const MFT_RECORD *m,
+ ntfs_inode **locked_ni);
+
+extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
+ ntfs_inode *base_ni, MFT_RECORD **mrec);
+extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
+
#endif /* NTFS_RW */
#endif /* _LINUX_NTFS_MFT_H */
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 70b503a85f4d..ac5997bec5cd 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -23,8 +23,11 @@
#include <linux/dcache.h>
#include <linux/security.h>
-#include "ntfs.h"
+#include "attrib.h"
+#include "debug.h"
#include "dir.h"
+#include "mft.h"
+#include "ntfs.h"
/**
* ntfs_lookup - find the inode represented by a dentry in a directory inode
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index c3fda17158e6..fb71b9fff857 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -29,21 +29,12 @@
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
-#include <linux/buffer_head.h>
#include <linux/nls.h>
-#include <linux/pagemap.h>
#include <linux/smp.h>
-#include <asm/atomic.h>
#include "types.h"
-#include "debug.h"
-#include "malloc.h"
-#include "endian.h"
#include "volume.h"
-#include "inode.h"
#include "layout.h"
-#include "attrib.h"
-#include "mft.h"
typedef enum {
NTFS_BLOCK_SIZE = 512,
@@ -65,7 +56,6 @@ extern kmem_cache_t *ntfs_index_ctx_cache;
extern struct super_operations ntfs_sops;
extern struct address_space_operations ntfs_aops;
extern struct address_space_operations ntfs_mst_aops;
-extern struct address_space_operations ntfs_mft_aops;
extern struct file_operations ntfs_file_ops;
extern struct inode_operations ntfs_file_inode_ops;
@@ -87,72 +77,12 @@ static inline ntfs_volume *NTFS_SB(struct super_block *sb)
return sb->s_fs_info;
}
-/**
- * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
- * @page: the page to release
- *
- * Unpin, unmap and release a page that was obtained from ntfs_map_page().
- */
-static inline void ntfs_unmap_page(struct page *page)
-{
- kunmap(page);
- page_cache_release(page);
-}
-
-/**
- * ntfs_map_page - map a page into accessible memory, reading it if necessary
- * @mapping: address space for which to obtain the page
- * @index: index into the page cache for @mapping of the page to map
- *
- * Read a page from the page cache of the address space @mapping at position
- * @index, where @index is in units of PAGE_CACHE_SIZE, and not in bytes.
- *
- * If the page is not in memory it is loaded from disk first using the readpage
- * method defined in the address space operations of @mapping and the page is
- * added to the page cache of @mapping in the process.
- *
- * If the page is in high memory it is mapped into memory directly addressible
- * by the kernel.
- *
- * Finally the page count is incremented, thus pinning the page into place.
- *
- * The above means that page_address(page) can be used on all pages obtained
- * with ntfs_map_page() to get the kernel virtual address of the page.
- *
- * When finished with the page, the caller has to call ntfs_unmap_page() to
- * unpin, unmap and release the page.
- *
- * Note this does not grant exclusive access. If such is desired, the caller
- * must provide it independently of the ntfs_{un}map_page() calls by using
- * a {rw_}semaphore or other means of serialization. A spin lock cannot be
- * used as ntfs_map_page() can block.
- *
- * The unlocked and uptodate page is returned on success or an encoded error
- * on failure. Caller has to test for error using the IS_ERR() macro on the
- * return value. If that evaluates to TRUE, the negative error code can be
- * obtained using PTR_ERR() on the return value of ntfs_map_page().
- */
-static inline struct page *ntfs_map_page(struct address_space *mapping,
- unsigned long index)
-{
- struct page *page = read_cache_page(mapping, index,
- (filler_t*)mapping->a_ops->readpage, NULL);
-
- if (!IS_ERR(page)) {
- wait_on_page_locked(page);
- kmap(page);
- if (PageUptodate(page) && !PageError(page))
- return page;
- ntfs_unmap_page(page);
- return ERR_PTR(-EIO);
- }
- return page;
-}
-
/* Declarations of functions and global variables. */
/* From fs/ntfs/compress.c */
extern int ntfs_read_compressed_block(struct page *page);
+extern int allocate_compression_buffers(void);
+extern void free_compression_buffers(void);
/* From fs/ntfs/super.c */
#define default_upcase_len 0x10000
@@ -166,10 +96,6 @@ typedef struct {
} option_t;
extern const option_t on_errors_arr[];
-/* From fs/ntfs/compress.c */
-extern int allocate_compression_buffers(void);
-extern void free_compression_buffers(void);
-
/* From fs/ntfs/mst.c */
extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size);
extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
index b72a85dd446f..8764ebd8d063 100644
--- a/fs/ntfs/quota.c
+++ b/fs/ntfs/quota.c
@@ -22,9 +22,10 @@
#ifdef NTFS_RW
-#include "ntfs.h"
#include "index.h"
#include "quota.h"
+#include "debug.h"
+#include "ntfs.h"
/**
* ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
new file mode 100644
index 000000000000..50bfb6b5b3e0
--- /dev/null
+++ b/fs/ntfs/runlist.c
@@ -0,0 +1,1462 @@
+/**
+ * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "debug.h"
+#include "dir.h"
+#include "endian.h"
+#include "malloc.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_rl_mm - runlist memmove
+ *
+ * It is up to the caller to serialize access to the runlist @base.
+ */
+static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
+ int size)
+{
+ if (likely((dst != src) && (size > 0)))
+		memmove(base + dst, base + src, size * sizeof(*base));
+}
+
+/**
+ * ntfs_rl_mc - runlist memory copy
+ *
+ * It is up to the caller to serialize access to the runlists @dstbase and
+ * @srcbase.
+ */
+static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
+ runlist_element *srcbase, int src, int size)
+{
+ if (likely(size > 0))
+ memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase));
+}
+
+/**
+ * ntfs_rl_realloc - Reallocate memory for runlists
+ * @rl: original runlist
+ * @old_size: number of runlist elements in the original runlist @rl
+ * @new_size: number of runlist elements we need space for
+ *
+ * As the runlists grow, more memory will be required. To prevent the
+ * kernel having to allocate and reallocate large numbers of small bits of
+ * memory, this function returns an entire page of memory.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B. If the new allocation doesn't require a different number of pages in
+ * memory, the function will return the original pointer.
+ *
+ * On success, return a pointer to the newly allocated, or recycled, memory.
+ * On error, return -errno. The following error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
+ int old_size, int new_size)
+{
+ runlist_element *new_rl;
+
+ old_size = PAGE_ALIGN(old_size * sizeof(*rl));
+ new_size = PAGE_ALIGN(new_size * sizeof(*rl));
+ if (old_size == new_size)
+ return rl;
+
+ new_rl = ntfs_malloc_nofs(new_size);
+ if (unlikely(!new_rl))
+ return ERR_PTR(-ENOMEM);
+
+ if (likely(rl != NULL)) {
+ if (unlikely(old_size > new_size))
+ old_size = new_size;
+ memcpy(new_rl, rl, old_size);
+ ntfs_free(rl);
+ }
+ return new_rl;
+}
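+
+/*
+ * A minimal illustration (editorial sketch, not used by the driver) of the
+ * page granular behaviour described above: growing a runlist within the
+ * same page is a no-op which hands back the original pointer.
+ */
+static inline void ntfs_rl_realloc_sketch(void)
+{
+	runlist_element *rl = ntfs_rl_realloc(NULL, 0, 1);
+
+	if (!IS_ERR(rl)) {
+		/* One and two elements both fit inside a single page. */
+		BUG_ON(ntfs_rl_realloc(rl, 1, 2) != rl);
+		ntfs_free(rl);
+	}
+}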
+
+/**
+ * ntfs_are_rl_mergeable - test if two runlists can be joined together
+ * @dst: original runlist
+ * @src: new runlist to test for mergeability with @dst
+ *
+ * Test if two runlists can be joined together. For this, their VCNs and LCNs
+ * must be adjacent.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * Return: TRUE Success, the runlists can be merged.
+ * FALSE Failure, the runlists cannot be merged.
+ */
+static inline BOOL ntfs_are_rl_mergeable(runlist_element *dst,
+ runlist_element *src)
+{
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ if ((dst->lcn < 0) || (src->lcn < 0)) /* Are we merging holes? */
+ return FALSE;
+ if ((dst->lcn + dst->length) != src->lcn) /* Are the runs contiguous? */
+ return FALSE;
+ if ((dst->vcn + dst->length) != src->vcn) /* Are the runs misaligned? */
+ return FALSE;
+
+ return TRUE;
+}
+
+/**
+ * __ntfs_rl_merge - merge two runlists without testing if they can be merged
+ * @dst: original, destination runlist
+ * @src: new runlist to merge with @dst
+ *
+ * Merge the two runlists, writing into the destination runlist @dst. The
+ * caller must make sure the runlists can be merged or this will corrupt the
+ * destination runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ */
+static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
+{
+ dst->length += src->length;
+}
+
+/**
+ * ntfs_rl_merge - test if two runlists can be joined together and merge them
+ * @dst: original, destination runlist
+ * @src: new runlist to merge with @dst
+ *
+ * Test if two runlists can be joined together. For this, their VCNs and LCNs
+ * must be adjacent. If they can be merged, perform the merge, writing into
+ * the destination runlist @dst.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * Return: TRUE Success, the runlists have been merged.
+ * FALSE Failure, the runlists cannot be merged and have not been
+ * modified.
+ */
+static inline BOOL ntfs_rl_merge(runlist_element *dst, runlist_element *src)
+{
+ BOOL merge = ntfs_are_rl_mergeable(dst, src);
+
+ if (merge)
+ __ntfs_rl_merge(dst, src);
+ return merge;
+}
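+
+/*
+ * A minimal illustration (editorial sketch, not used by the driver) of the
+ * merge rules: the two runs below abut in both vcn and lcn space, so they
+ * collapse into a single run of length 6.
+ */
+static inline void ntfs_rl_merge_sketch(void)
+{
+	runlist_element a = { .vcn = 0, .lcn = 100, .length = 4 };
+	runlist_element b = { .vcn = 4, .lcn = 104, .length = 2 };
+
+	BUG_ON(!ntfs_rl_merge(&a, &b));
+	BUG_ON(a.length != 6);
+}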
+
+/**
+ * ntfs_rl_append - append a runlist after a given element
+ * @dst: original runlist to be worked on
+ * @dsize: number of elements in @dst (including end marker)
+ * @src: runlist to be inserted into @dst
+ * @ssize: number of elements in @src (excluding end marker)
+ * @loc: append the new runlist @src after this element in @dst
+ *
+ * Append the runlist @src after element @loc in @dst. Merge the right end of
+ * the new runlist, if necessary. Adjust the size of the hole before the
+ * appended runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_append(runlist_element *dst,
+ int dsize, runlist_element *src, int ssize, int loc)
+{
+ BOOL right;
+ int magic;
+
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ /* First, check if the right hand end needs merging. */
+ right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+
+ /* Space required: @dst size + @src size, less one if we merged. */
+ dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
+ if (IS_ERR(dst))
+ return dst;
+ /*
+ * We are guaranteed to succeed from here so can start modifying the
+ * original runlists.
+ */
+
+ /* First, merge the right hand end, if necessary. */
+ if (right)
+ __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
+
+ magic = loc + ssize;
+
+ /* Move the tail of @dst out of the way, then copy in @src. */
+ ntfs_rl_mm(dst, magic + 1, loc + 1 + right, dsize - loc - 1 - right);
+ ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
+
+ /* Adjust the size of the preceding hole. */
+ dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
+
+ /* We may have changed the length of the file, so fix the end marker */
+ if (dst[magic + 1].lcn == LCN_ENOENT)
+ dst[magic + 1].vcn = dst[magic].vcn + dst[magic].length;
+
+ return dst;
+}
+
+/**
+ * ntfs_rl_insert - insert a runlist into another
+ * @dst: original runlist to be worked on
+ * @dsize: number of elements in @dst (including end marker)
+ * @src: new runlist to be inserted
+ * @ssize: number of elements in @src (excluding end marker)
+ * @loc: insert the new runlist @src before this element in @dst
+ *
+ * Insert the runlist @src before element @loc in the runlist @dst. Merge the
+ * left end of the new runlist, if necessary. Adjust the size of the hole
+ * after the inserted runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
+ int dsize, runlist_element *src, int ssize, int loc)
+{
+ BOOL left = FALSE;
+ BOOL disc = FALSE; /* Discontinuity */
+ BOOL hole = FALSE; /* Following a hole */
+ int magic;
+
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ /* disc => Discontinuity between the end of @dst and the start of @src.
+ * This means we might need to insert a hole.
+ * hole => @dst ends with a hole or an unmapped region which we can
+ * extend to match the discontinuity. */
+ if (loc == 0)
+ disc = (src[0].vcn > 0);
+ else {
+ s64 merged_length;
+
+ left = ntfs_are_rl_mergeable(dst + loc - 1, src);
+
+ merged_length = dst[loc - 1].length;
+ if (left)
+ merged_length += src->length;
+
+ disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
+ if (disc)
+ hole = (dst[loc - 1].lcn == LCN_HOLE);
+ }
+
+ /* Space required: @dst size + @src size, less one if we merged, plus
+ * one if there was a discontinuity, less one for a trailing hole. */
+ dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc - hole);
+ if (IS_ERR(dst))
+ return dst;
+ /*
+ * We are guaranteed to succeed from here so can start modifying the
+ * original runlist.
+ */
+
+ if (left)
+ __ntfs_rl_merge(dst + loc - 1, src);
+
+ magic = loc + ssize - left + disc - hole;
+
+ /* Move the tail of @dst out of the way, then copy in @src. */
+ ntfs_rl_mm(dst, magic, loc, dsize - loc);
+ ntfs_rl_mc(dst, loc + disc - hole, src, left, ssize - left);
+
+ /* Adjust the VCN of the last run ... */
+ if (dst[magic].lcn <= LCN_HOLE)
+ dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
+ /* ... and the length. */
+ if (dst[magic].lcn == LCN_HOLE || dst[magic].lcn == LCN_RL_NOT_MAPPED)
+ dst[magic].length = dst[magic + 1].vcn - dst[magic].vcn;
+
+ /* Writing beyond the end of the file and there's a discontinuity. */
+ if (disc) {
+ if (hole)
+ dst[loc - 1].length = dst[loc].vcn - dst[loc - 1].vcn;
+ else {
+ if (loc > 0) {
+ dst[loc].vcn = dst[loc - 1].vcn +
+ dst[loc - 1].length;
+ dst[loc].length = dst[loc + 1].vcn -
+ dst[loc].vcn;
+ } else {
+ dst[loc].vcn = 0;
+ dst[loc].length = dst[loc + 1].vcn;
+ }
+ dst[loc].lcn = LCN_RL_NOT_MAPPED;
+ }
+
+ magic += hole;
+
+ if (dst[magic].lcn == LCN_ENOENT)
+ dst[magic].vcn = dst[magic - 1].vcn +
+ dst[magic - 1].length;
+ }
+ return dst;
+}
+
+/**
+ * ntfs_rl_replace - overwrite a runlist element with another runlist
+ * @dst: original runlist to be worked on
+ * @dsize: number of elements in @dst (including end marker)
+ * @src: new runlist to be inserted
+ * @ssize: number of elements in @src (excluding end marker)
+ * @loc: index in runlist @dst to overwrite with @src
+ *
+ * Replace the runlist element @dst at @loc with @src. Merge the left and
+ * right ends of the inserted runlist, if necessary.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
+ int dsize, runlist_element *src, int ssize, int loc)
+{
+ BOOL left = FALSE;
+ BOOL right;
+ int magic;
+
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ /* First, merge the left and right ends, if necessary. */
+ right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+ if (loc > 0)
+ left = ntfs_are_rl_mergeable(dst + loc - 1, src);
+
+ /* Allocate some space. We'll need less if the left, right, or both
+ * ends were merged. */
+ dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left - right);
+ if (IS_ERR(dst))
+ return dst;
+ /*
+ * We are guaranteed to succeed from here so can start modifying the
+ * original runlists.
+ */
+ if (right)
+ __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
+ if (left)
+ __ntfs_rl_merge(dst + loc - 1, src);
+
+ /* FIXME: What does this mean? (AIA) */
+ magic = loc + ssize - left;
+
+ /* Move the tail of @dst out of the way, then copy in @src. */
+ ntfs_rl_mm(dst, magic, loc + right + 1, dsize - loc - right - 1);
+ ntfs_rl_mc(dst, loc, src, left, ssize - left);
+
+ /* We may have changed the length of the file, so fix the end marker */
+ if (dst[magic].lcn == LCN_ENOENT)
+ dst[magic].vcn = dst[magic - 1].vcn + dst[magic - 1].length;
+ return dst;
+}
+
+/**
+ * ntfs_rl_split - insert a runlist into the centre of a hole
+ * @dst: original runlist to be worked on
+ * @dsize: number of elements in @dst (including end marker)
+ * @src: new runlist to be inserted
+ * @ssize: number of elements in @src (excluding end marker)
+ * @loc: index in runlist @dst at which to split and insert @src
+ *
+ * Split the runlist @dst at @loc into two and insert @src in between the two
+ * fragments. No merging of runlists is necessary. Adjust the size of the
+ * holes either side.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
+ runlist_element *src, int ssize, int loc)
+{
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ /* Space required: @dst size + @src size + one new hole. */
+ dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
+ if (IS_ERR(dst))
+ return dst;
+ /*
+ * We are guaranteed to succeed from here so can start modifying the
+ * original runlists.
+ */
+
+ /* Move the tail of @dst out of the way, then copy in @src. */
+ ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
+ ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
+
+	/* Adjust the size of the holes either side of @src. */
+ dst[loc].length = dst[loc+1].vcn - dst[loc].vcn;
+ dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length;
+ dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
+
+ return dst;
+}
+
+/**
+ * ntfs_runlists_merge - merge two runlists into one
+ * @drl: original runlist to be worked on
+ * @srl: new runlist to be merged into @drl
+ *
+ * First we sanity check the two runlists @srl and @drl to make sure that they
+ * are sensible and can be merged. The runlist @srl must be either after the
+ * runlist @drl or completely within a hole (or unmapped region) in @drl.
+ *
+ * It is up to the caller to serialize access to the runlists @drl and @srl.
+ *
+ * Merging of runlists is necessary in two cases:
+ * 1. When attribute lists are used and a further extent is being mapped.
+ * 2. When new clusters are allocated to fill a hole or extend a file.
+ *
+ * There are four possible ways @srl can be merged, illustrated by the sketch
+ * following this function. It can:
+ * - be inserted at the beginning of a hole,
+ * - split the hole in two and be inserted between the two fragments,
+ * - be appended at the end of a hole, or it can
+ * - replace the whole hole.
+ * It can also be appended to the end of the runlist, which is just a variant
+ * of the insert case.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @drl and @srl are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @drl but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ * -ERANGE - The runlists overlap and cannot be merged.
+ */
+runlist_element *ntfs_runlists_merge(runlist_element *drl,
+ runlist_element *srl)
+{
+ int di, si; /* Current index into @[ds]rl. */
+ int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */
+ int dins; /* Index into @drl at which to insert @srl. */
+ int dend, send; /* Last index into @[ds]rl. */
+ int dfinal, sfinal; /* The last index into @[ds]rl with
+ lcn >= LCN_HOLE. */
+ int marker = 0;
+ VCN marker_vcn = 0;
+
+#ifdef DEBUG
+ ntfs_debug("dst:");
+ ntfs_debug_dump_runlist(drl);
+ ntfs_debug("src:");
+ ntfs_debug_dump_runlist(srl);
+#endif
+
+ /* Check for silly calling... */
+ if (unlikely(!srl))
+ return drl;
+ if (IS_ERR(srl) || IS_ERR(drl))
+ return ERR_PTR(-EINVAL);
+
+ /* Check for the case where the first mapping is being done now. */
+ if (unlikely(!drl)) {
+ drl = srl;
+ /* Complete the source runlist if necessary. */
+ if (unlikely(drl[0].vcn)) {
+ /* Scan to the end of the source runlist. */
+ for (dend = 0; likely(drl[dend].length); dend++)
+ ;
+ drl = ntfs_rl_realloc(drl, dend, dend + 1);
+ if (IS_ERR(drl))
+ return drl;
+ /* Insert start element at the front of the runlist. */
+ ntfs_rl_mm(drl, 1, 0, dend);
+ drl[0].vcn = 0;
+ drl[0].lcn = LCN_RL_NOT_MAPPED;
+ drl[0].length = drl[1].vcn;
+ }
+ goto finished;
+ }
+
+ si = di = 0;
+
+ /* Skip any unmapped start element(s) in the source runlist. */
+ while (srl[si].length && srl[si].lcn < LCN_HOLE)
+ si++;
+
+ /* Can't have an entirely unmapped source runlist. */
+ BUG_ON(!srl[si].length);
+
+ /* Record the starting points. */
+ sstart = si;
+
+ /*
+ * Skip forward in @drl until we reach the position where @srl needs to
+ * be inserted. If we reach the end of @drl, @srl just needs to be
+ * appended to @drl.
+ */
+ for (; drl[di].length; di++) {
+ if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
+ break;
+ }
+ dins = di;
+
+ /* Sanity check for illegal overlaps. */
+ if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
+ (srl[si].lcn >= 0)) {
+ ntfs_error(NULL, "Run lists overlap. Cannot merge!");
+ return ERR_PTR(-ERANGE);
+ }
+
+ /* Scan to the end of both runlists in order to know their sizes. */
+ for (send = si; srl[send].length; send++)
+ ;
+ for (dend = di; drl[dend].length; dend++)
+ ;
+
+ if (srl[send].lcn == LCN_ENOENT)
+ marker_vcn = srl[marker = send].vcn;
+
+ /* Scan to the last element with lcn >= LCN_HOLE. */
+ for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
+ ;
+ for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
+ ;
+
+ {
+ BOOL start;
+ BOOL finish;
+ int ds = dend + 1; /* Number of elements in drl & srl */
+ int ss = sfinal - sstart + 1;
+
+ start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */
+ (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */
+ finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */
+ ((drl[dins].vcn + drl[dins].length) <= /* End of hole */
+ (srl[send - 1].vcn + srl[send - 1].length)));
+
+ /* Or we'll lose an end marker */
+ if (start && finish && (drl[dins].length == 0))
+ ss++;
+ if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
+ finish = FALSE;
+#if 0
+ ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
+ ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
+ ntfs_debug("start = %i, finish = %i", start, finish);
+ ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
+#endif
+ if (start) {
+ if (finish)
+ drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
+ else
+ drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
+ } else {
+ if (finish)
+ drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
+ else
+ drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
+ }
+ if (IS_ERR(drl)) {
+ ntfs_error(NULL, "Merge failed.");
+ return drl;
+ }
+ ntfs_free(srl);
+ if (marker) {
+ ntfs_debug("Triggering marker code.");
+ for (ds = dend; drl[ds].length; ds++)
+ ;
+ /* We only need to care if @srl ended after @drl. */
+ if (drl[ds].vcn <= marker_vcn) {
+ int slots = 0;
+
+ if (drl[ds].vcn == marker_vcn) {
+ ntfs_debug("Old marker = 0x%llx, replacing "
+ "with LCN_ENOENT.",
+ (unsigned long long)
+ drl[ds].lcn);
+ drl[ds].lcn = LCN_ENOENT;
+ goto finished;
+ }
+ /*
+ * We need to create an unmapped runlist element in
+ * @drl or extend an existing one before adding the
+ * ENOENT terminator.
+ */
+ if (drl[ds].lcn == LCN_ENOENT) {
+ ds--;
+ slots = 1;
+ }
+ if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
+ /* Add an unmapped runlist element. */
+ if (!slots) {
+ /* FIXME/TODO: We need to have the
+ * extra memory already! (AIA) */
+ drl = ntfs_rl_realloc(drl, ds, ds + 2);
+ if (!drl)
+ goto critical_error;
+ slots = 2;
+ }
+ ds++;
+ /* Need to set vcn if it isn't set already. */
+ if (slots != 1)
+ drl[ds].vcn = drl[ds - 1].vcn +
+ drl[ds - 1].length;
+ drl[ds].lcn = LCN_RL_NOT_MAPPED;
+ /* We now used up a slot. */
+ slots--;
+ }
+ drl[ds].length = marker_vcn - drl[ds].vcn;
+ /* Finally add the ENOENT terminator. */
+ ds++;
+ if (!slots) {
+ /* FIXME/TODO: We need to have the extra
+ * memory already! (AIA) */
+ drl = ntfs_rl_realloc(drl, ds, ds + 1);
+ if (!drl)
+ goto critical_error;
+ }
+ drl[ds].vcn = marker_vcn;
+ drl[ds].lcn = LCN_ENOENT;
+ drl[ds].length = (s64)0;
+ }
+ }
+ }
+
+finished:
+ /* The merge was completed successfully. */
+ ntfs_debug("Merged runlist:");
+ ntfs_debug_dump_runlist(drl);
+ return drl;
+
+critical_error:
+ /* Critical error! We cannot afford to fail here. */
+ ntfs_error(NULL, "Critical error! Not enough memory.");
+ panic("NTFS: Cannot continue.");
+}
+
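+/*
+ * Editorial sketch of the four merge cases, with D an existing run, H a
+ * hole and S the runs of @srl:
+ *
+ *	insert:  DDDHHHDDD + SS at hole start -> DDDSSHDDD
+ *	split:   DDDHHHDDD + S mid hole       -> DDDHSHDDD
+ *	append:  DDDHHHDDD + SS at hole end   -> DDDHSSDDD
+ *	replace: DDDHHHDDD + SSS whole hole   -> DDDSSSDDD
+ */
+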
+/**
+ * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist
+ * @vol: ntfs volume on which the attribute resides
+ * @attr: attribute record whose mapping pairs array to decompress
+ * @old_rl: optional runlist in which to insert @attr's runlist
+ *
+ * It is up to the caller to serialize access to the runlist @old_rl.
+ *
+ * Decompress the attribute @attr's mapping pairs array into a runlist. On
+ * success, return the decompressed runlist.
+ *
+ * If @old_rl is not NULL, the decompressed runlist is inserted into the
+ * appropriate place in @old_rl and the resultant, combined runlist is
+ * returned. The original @old_rl is deallocated.
+ *
+ * On error, return -errno. @old_rl is left unmodified in that case.
+ *
+ * The following error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EIO - Corrupt runlist.
+ * -EINVAL - Invalid parameters were passed in.
+ * -ERANGE - The two runlists overlap.
+ *
+ * FIXME: For now we take the conceptually simplest approach of creating the
+ * new runlist disregarding the already existing one and then splicing the
+ * two into one, if that is possible (we check for overlap and discard the new
+ * runlist if overlap present before returning ERR_PTR(-ERANGE)).
+ */
+runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
+ const ATTR_RECORD *attr, runlist_element *old_rl)
+{
+ VCN vcn; /* Current vcn. */
+ LCN lcn; /* Current lcn. */
+ s64 deltaxcn; /* Change in [vl]cn. */
+ runlist_element *rl; /* The output runlist. */
+ u8 *buf; /* Current position in mapping pairs array. */
+ u8 *attr_end; /* End of attribute. */
+ int rlsize; /* Size of runlist buffer. */
+ u16 rlpos; /* Current runlist position in units of
+ runlist_elements. */
+ u8 b; /* Current byte offset in buf. */
+
+#ifdef DEBUG
+ /* Make sure attr exists and is non-resident. */
+ if (!attr || !attr->non_resident || sle64_to_cpu(
+ attr->data.non_resident.lowest_vcn) < (VCN)0) {
+ ntfs_error(vol->sb, "Invalid arguments.");
+ return ERR_PTR(-EINVAL);
+ }
+#endif
+ /* Start at vcn = lowest_vcn and lcn 0. */
+ vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
+ lcn = 0;
+ /* Get start of the mapping pairs array. */
+ buf = (u8*)attr + le16_to_cpu(
+ attr->data.non_resident.mapping_pairs_offset);
+ attr_end = (u8*)attr + le32_to_cpu(attr->length);
+ if (unlikely(buf < (u8*)attr || buf > attr_end)) {
+ ntfs_error(vol->sb, "Corrupt attribute.");
+ return ERR_PTR(-EIO);
+ }
+ /* Current position in runlist array. */
+ rlpos = 0;
+ /* Allocate first page and set current runlist size to one page. */
+ rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
+ if (unlikely(!rl))
+ return ERR_PTR(-ENOMEM);
+ /* Insert unmapped starting element if necessary. */
+ if (vcn) {
+ rl->vcn = 0;
+ rl->lcn = LCN_RL_NOT_MAPPED;
+ rl->length = vcn;
+ rlpos++;
+ }
+ while (buf < attr_end && *buf) {
+ /*
+ * Allocate more memory if needed, including space for the
+ * not-mapped and terminator elements. ntfs_malloc_nofs()
+ * operates on whole pages only.
+ */
+ if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
+ runlist_element *rl2;
+
+ rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
+ if (unlikely(!rl2)) {
+ ntfs_free(rl);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(rl2, rl, rlsize);
+ ntfs_free(rl);
+ rl = rl2;
+ rlsize += PAGE_SIZE;
+ }
+ /* Enter the current vcn into the current runlist element. */
+ rl[rlpos].vcn = vcn;
+ /*
+ * Get the change in vcn, i.e. the run length in clusters.
+		 * Doing it this way ensures that we sign-extend negative values.
+ * A negative run length doesn't make any sense, but hey, I
+ * didn't make up the NTFS specs and Windows NT4 treats the run
+ * length as a signed value so that's how it is...
+ */
+ b = *buf & 0xf;
+ if (b) {
+ if (unlikely(buf + b > attr_end))
+ goto io_error;
+ for (deltaxcn = (s8)buf[b--]; b; b--)
+ deltaxcn = (deltaxcn << 8) + buf[b];
+ } else { /* The length entry is compulsory. */
+ ntfs_error(vol->sb, "Missing length entry in mapping "
+ "pairs array.");
+ deltaxcn = (s64)-1;
+ }
+ /*
+ * Assume a negative length to indicate data corruption and
+ * hence clean-up and return NULL.
+ */
+ if (unlikely(deltaxcn < 0)) {
+ ntfs_error(vol->sb, "Invalid length in mapping pairs "
+ "array.");
+ goto err_out;
+ }
+ /*
+ * Enter the current run length into the current runlist
+ * element.
+ */
+ rl[rlpos].length = deltaxcn;
+ /* Increment the current vcn by the current run length. */
+ vcn += deltaxcn;
+ /*
+ * There might be no lcn change at all, as is the case for
+ * sparse clusters on NTFS 3.0+, in which case we set the lcn
+ * to LCN_HOLE.
+ */
+ if (!(*buf & 0xf0))
+ rl[rlpos].lcn = LCN_HOLE;
+ else {
+ /* Get the lcn change which really can be negative. */
+ u8 b2 = *buf & 0xf;
+ b = b2 + ((*buf >> 4) & 0xf);
+ if (buf + b > attr_end)
+ goto io_error;
+ for (deltaxcn = (s8)buf[b--]; b > b2; b--)
+ deltaxcn = (deltaxcn << 8) + buf[b];
+ /* Change the current lcn to its new value. */
+ lcn += deltaxcn;
+#ifdef DEBUG
+ /*
+ * On NTFS 1.2-, apparently can have lcn == -1 to
+ * indicate a hole. But we haven't verified ourselves
+ * whether it is really the lcn or the deltaxcn that is
+ * -1. So if either is found give us a message so we
+ * can investigate it further!
+ */
+ if (vol->major_ver < 3) {
+ if (unlikely(deltaxcn == (LCN)-1))
+ ntfs_error(vol->sb, "lcn delta == -1");
+ if (unlikely(lcn == (LCN)-1))
+ ntfs_error(vol->sb, "lcn == -1");
+ }
+#endif
+ /* Check lcn is not below -1. */
+ if (unlikely(lcn < (LCN)-1)) {
+ ntfs_error(vol->sb, "Invalid LCN < -1 in "
+ "mapping pairs array.");
+ goto err_out;
+ }
+ /* Enter the current lcn into the runlist element. */
+ rl[rlpos].lcn = lcn;
+ }
+ /* Get to the next runlist element. */
+ rlpos++;
+ /* Increment the buffer position to the next mapping pair. */
+ buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
+ }
+ if (unlikely(buf >= attr_end))
+ goto io_error;
+ /*
+ * If there is a highest_vcn specified, it must be equal to the final
+ * vcn in the runlist - 1, or something has gone badly wrong.
+ */
+ deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
+ if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
+mpa_err:
+ ntfs_error(vol->sb, "Corrupt mapping pairs array in "
+ "non-resident attribute.");
+ goto err_out;
+ }
+ /* Setup not mapped runlist element if this is the base extent. */
+ if (!attr->data.non_resident.lowest_vcn) {
+ VCN max_cluster;
+
+ max_cluster = (sle64_to_cpu(
+ attr->data.non_resident.allocated_size) +
+ vol->cluster_size - 1) >>
+ vol->cluster_size_bits;
+ /*
+ * If there is a difference between the highest_vcn and the
+ * highest cluster, the runlist is either corrupt or, more
+ * likely, there are more extents following this one.
+ */
+ if (deltaxcn < --max_cluster) {
+ ntfs_debug("More extents to follow; deltaxcn = 0x%llx, "
+ "max_cluster = 0x%llx",
+ (unsigned long long)deltaxcn,
+ (unsigned long long)max_cluster);
+ rl[rlpos].vcn = vcn;
+ vcn += rl[rlpos].length = max_cluster - deltaxcn;
+ rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+ rlpos++;
+ } else if (unlikely(deltaxcn > max_cluster)) {
+ ntfs_error(vol->sb, "Corrupt attribute. deltaxcn = "
+ "0x%llx, max_cluster = 0x%llx",
+ (unsigned long long)deltaxcn,
+ (unsigned long long)max_cluster);
+ goto mpa_err;
+ }
+ rl[rlpos].lcn = LCN_ENOENT;
+ } else /* Not the base extent. There may be more extents to follow. */
+ rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+
+ /* Setup terminating runlist element. */
+ rl[rlpos].vcn = vcn;
+ rl[rlpos].length = (s64)0;
+ /* If no existing runlist was specified, we are done. */
+ if (!old_rl) {
+ ntfs_debug("Mapping pairs array successfully decompressed:");
+ ntfs_debug_dump_runlist(rl);
+ return rl;
+ }
+ /* Now combine the new and old runlists checking for overlaps. */
+ old_rl = ntfs_runlists_merge(old_rl, rl);
+ if (likely(!IS_ERR(old_rl)))
+ return old_rl;
+ ntfs_free(rl);
+ ntfs_error(vol->sb, "Failed to merge runlists.");
+ return old_rl;
+io_error:
+ ntfs_error(vol->sb, "Corrupt attribute.");
+err_out:
+ ntfs_free(rl);
+ return ERR_PTR(-EIO);
+}
+
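+/*
+ * Worked example (editorial, illustrative only): the mapping pairs byte
+ * stream 21 18 34 56 00 decodes as follows. The header byte 0x21 says the
+ * run length takes one byte and the lcn delta two bytes. The length byte
+ * is 0x18 (24 clusters) and the delta bytes 34 56, read little endian,
+ * give 0x5634, so starting from lcn 0 the run covers vcns 0 to 0x17 at
+ * lcn 0x5634. The final 00 byte is the terminator.
+ */
+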
+/**
+ * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist
+ * @rl: runlist to use for conversion
+ * @vcn: vcn to convert
+ *
+ * Convert the virtual cluster number @vcn of an attribute into a logical
+ * cluster number (lcn) of a device using the runlist @rl to map vcns to their
+ * corresponding lcns.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * Since lcns must be >= 0, we use negative return values with special meaning:
+ *
+ * Return value Meaning / Description
+ * ==================================================
+ * -1 = LCN_HOLE Hole / not allocated on disk.
+ * -2 = LCN_RL_NOT_MAPPED This is part of the runlist which has not been
+ * inserted into the runlist yet.
+ * -3 = LCN_ENOENT There is no such vcn in the attribute.
+ *
+ * Locking: - The caller must have locked the runlist (for reading or writing).
+ * - This function does not touch the lock.
+ */
+LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
+{
+ int i;
+
+ BUG_ON(vcn < 0);
+ /*
+ * If rl is NULL, assume that we have found an unmapped runlist. The
+ * caller can then attempt to map it and fail appropriately if
+ * necessary.
+ */
+ if (unlikely(!rl))
+ return LCN_RL_NOT_MAPPED;
+
+ /* Catch out of lower bounds vcn. */
+ if (unlikely(vcn < rl[0].vcn))
+ return LCN_ENOENT;
+
+ for (i = 0; likely(rl[i].length); i++) {
+ if (unlikely(vcn < rl[i+1].vcn)) {
+ if (likely(rl[i].lcn >= (LCN)0))
+ return rl[i].lcn + (vcn - rl[i].vcn);
+ return rl[i].lcn;
+ }
+ }
+ /*
+ * The terminator element is setup to the correct value, i.e. one of
+ * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT.
+ */
+ if (likely(rl[i].lcn < (LCN)0))
+ return rl[i].lcn;
+ /* Just in case... We could replace this with BUG() some day. */
+ return LCN_ENOENT;
+}
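+
+/*
+ * A minimal usage sketch (editorial, not used by the driver): in the
+ * runlist below, vcn 2 lies inside the first run and maps to lcn 102,
+ * vcn 5 falls into the hole, and vcn 6 is past the end of the attribute.
+ */
+static inline void ntfs_rl_vcn_to_lcn_sketch(void)
+{
+	runlist_element rl[3] = {
+		{ .vcn = 0, .lcn = 100, .length = 4 },
+		{ .vcn = 4, .lcn = LCN_HOLE, .length = 2 },
+		{ .vcn = 6, .lcn = LCN_ENOENT, .length = 0 },
+	};
+
+	BUG_ON(ntfs_rl_vcn_to_lcn(rl, 2) != 102);
+	BUG_ON(ntfs_rl_vcn_to_lcn(rl, 5) != LCN_HOLE);
+	BUG_ON(ntfs_rl_vcn_to_lcn(rl, 6) != LCN_ENOENT);
+}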
+
+/**
+ * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
+ * @n: number for which to get the number of bytes
+ *
+ * Return the number of bytes required to store @n unambiguously as
+ * a signed number.
+ *
+ * This is used in the context of the mapping pairs array to determine how
+ * many bytes will be needed in the array to store a given logical cluster
+ * number (lcn) or a specific run length.
+ *
+ * Return the number of bytes needed. This function cannot fail.
+ */
+static inline int ntfs_get_nr_significant_bytes(const s64 n)
+{
+ s64 l = n;
+ int i;
+ s8 j;
+
+ i = 0;
+ do {
+ l >>= 8;
+ i++;
+ } while (l != 0 && l != -1);
+ j = (n >> 8 * (i - 1)) & 0xff;
+ /* If the sign bit is wrong, we need an extra byte. */
+ if ((n < 0 && j >= 0) || (n > 0 && j < 0))
+ i++;
+ return i;
+}
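+
+/*
+ * Worked examples (editorial, illustrative only): 0x7f fits in one byte,
+ * but 0x80 needs two because the top bit of a single byte would read back
+ * as the sign bit; symmetrically, -0x80 fits in one byte while -0x81
+ * needs two.
+ */
+static inline void ntfs_nr_significant_bytes_sketch(void)
+{
+	BUG_ON(ntfs_get_nr_significant_bytes(0x7f) != 1);
+	BUG_ON(ntfs_get_nr_significant_bytes(0x80) != 2);
+	BUG_ON(ntfs_get_nr_significant_bytes(-0x80) != 1);
+	BUG_ON(ntfs_get_nr_significant_bytes(-0x81) != 2);
+}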
+
+/**
+ * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
+ * @vol: ntfs volume (needed for the ntfs version)
+ * @rl: locked runlist for which to determine the mapping pairs array size
+ * @start_vcn: vcn at which to start the mapping pairs array
+ *
+ * Walk the locked runlist @rl and calculate the size in bytes of the mapping
+ * pairs array corresponding to the runlist @rl, starting at vcn @start_vcn.
+ * This for example allows us to allocate a buffer of the right size when
+ * building the mapping pairs array.
+ *
+ * If @rl is NULL, just return 1 (for the single terminator byte).
+ *
+ * Return the calculated size in bytes on success. On error, return -errno.
+ * The following error codes are defined:
+ * -EINVAL - Run list contains unmapped elements. Make sure to only pass
+ * fully mapped runlists to this function.
+ * -EIO - The runlist is corrupt.
+ *
+ * Locking: @rl must be locked on entry (either for reading or writing), it
+ * remains locked throughout, and is left locked upon return.
+ */
+int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
+ const runlist_element *rl, const VCN start_vcn)
+{
+ LCN prev_lcn;
+ int rls;
+
+ BUG_ON(start_vcn < 0);
+ if (!rl) {
+ BUG_ON(start_vcn);
+ return 1;
+ }
+ /* Skip to runlist element containing @start_vcn. */
+ while (rl->length && start_vcn >= rl[1].vcn)
+ rl++;
+ if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn)
+ return -EINVAL;
+ prev_lcn = 0;
+	/* Always need the terminating zero byte. */
+ rls = 1;
+ /* Do the first partial run if present. */
+ if (start_vcn > rl->vcn) {
+ s64 delta;
+
+ /* We know rl->length != 0 already. */
+ if (rl->length < 0 || rl->lcn < LCN_HOLE)
+ goto err_out;
+ delta = start_vcn - rl->vcn;
+ /* Header byte + length. */
+ rls += 1 + ntfs_get_nr_significant_bytes(rl->length - delta);
+ /*
+ * If the logical cluster number (lcn) denotes a hole and we
+ * are on NTFS 3.0+, we don't store it at all, i.e. we need
+ * zero space. On earlier NTFS versions we just store the lcn.
+ * Note: this assumes that on NTFS 1.2-, holes are stored with
+ * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
+ */
+ if (rl->lcn >= 0 || vol->major_ver < 3) {
+ prev_lcn = rl->lcn;
+ if (rl->lcn >= 0)
+ prev_lcn += delta;
+ /* Change in lcn. */
+ rls += ntfs_get_nr_significant_bytes(prev_lcn);
+ }
+ /* Go to next runlist element. */
+ rl++;
+ }
+ /* Do the full runs. */
+ for (; rl->length; rl++) {
+ if (rl->length < 0 || rl->lcn < LCN_HOLE)
+ goto err_out;
+ /* Header byte + length. */
+ rls += 1 + ntfs_get_nr_significant_bytes(rl->length);
+ /*
+ * If the logical cluster number (lcn) denotes a hole and we
+ * are on NTFS 3.0+, we don't store it at all, i.e. we need
+ * zero space. On earlier NTFS versions we just store the lcn.
+ * Note: this assumes that on NTFS 1.2-, holes are stored with
+ * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
+ */
+ if (rl->lcn >= 0 || vol->major_ver < 3) {
+ /* Change in lcn. */
+ rls += ntfs_get_nr_significant_bytes(rl->lcn -
+ prev_lcn);
+ prev_lcn = rl->lcn;
+ }
+ }
+ return rls;
+err_out:
+ if (rl->lcn == LCN_RL_NOT_MAPPED)
+ rls = -EINVAL;
+ else
+ rls = -EIO;
+ return rls;
+}
+
+/**
+ * ntfs_write_significant_bytes - write the significant bytes of a number
+ * @dst: destination buffer to write to
+ * @dst_max: pointer to last byte of destination buffer for bounds checking
+ * @n: number whose significant bytes to write
+ *
+ * Store in @dst the minimum number of bytes required to identify @n
+ * unambiguously as a signed number, taking care not to exceed @dst_max, the
+ * maximum position within @dst to which we are allowed to write.
+ *
+ * This is used when building the mapping pairs array of a runlist to compress
+ * a given logical cluster number (lcn) or a specific run length to the minimum
+ * size possible.
+ *
+ * Return the number of bytes written on success. On error, i.e. the
+ * destination buffer @dst is too small, return -ENOSPC.
+ */
+static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
+ const s64 n)
+{
+ s64 l = n;
+ int i;
+ s8 j;
+
+ i = 0;
+ do {
+ if (dst > dst_max)
+ goto err_out;
+ *dst++ = l & 0xffll;
+ l >>= 8;
+ i++;
+ } while (l != 0 && l != -1);
+ j = (n >> 8 * (i - 1)) & 0xff;
+ /* If the sign bit is wrong, we need an extra byte. */
+ if (n < 0 && j >= 0) {
+ if (dst > dst_max)
+ goto err_out;
+ i++;
+ *dst = (s8)-1;
+ } else if (n > 0 && j < 0) {
+ if (dst > dst_max)
+ goto err_out;
+ i++;
+ *dst = (s8)0;
+ }
+ return i;
+err_out:
+ return -ENOSPC;
+}
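+
+/*
+ * A minimal usage sketch (editorial, not used by the driver): 0x5634 is
+ * emitted little endian as the two bytes 0x34, 0x56.
+ */
+static inline void ntfs_write_significant_bytes_sketch(void)
+{
+	s8 buf[8];
+
+	BUG_ON(ntfs_write_significant_bytes(buf, buf + sizeof(buf) - 1,
+			0x5634) != 2);
+	BUG_ON(buf[0] != 0x34 || buf[1] != 0x56);
+}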
+
+/**
+ * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist
+ * @vol: ntfs volume (needed for the ntfs version)
+ * @dst: destination buffer to which to write the mapping pairs array
+ * @dst_len: size of destination buffer @dst in bytes
+ * @rl: locked runlist for which to build the mapping pairs array
+ * @start_vcn: vcn at which to start the mapping pairs array
+ * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC
+ *
+ * Create the mapping pairs array from the locked runlist @rl, starting at vcn
+ * @start_vcn and save the array in @dst. @dst_len is the size of @dst in
+ * bytes and it should be at least equal to the value obtained by calling
+ * ntfs_get_size_for_mapping_pairs().
+ *
+ * If @rl is NULL, just write a single terminator byte to @dst.
+ *
+ * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to
+ * the first vcn outside the destination buffer. Note that on error, @dst has
+ * been filled with all the mapping pairs that will fit, thus it can be treated
+ * as partial success, in that a new attribute extent needs to be created or
+ * the next extent has to be used and the mapping pairs build has to be
+ * continued with @start_vcn set to *@stop_vcn.
+ *
+ * Return 0 on success and -errno on error. The following error codes are
+ * defined:
+ * -EINVAL - Run list contains unmapped elements. Make sure to only pass
+ * fully mapped runlists to this function.
+ * -EIO - The runlist is corrupt.
+ * -ENOSPC - The destination buffer is too small.
+ *
+ * Locking: @rl must be locked on entry (either for reading or writing), it
+ * remains locked throughout, and is left locked upon return.
+ */
+int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
+ const int dst_len, const runlist_element *rl,
+ const VCN start_vcn, VCN *const stop_vcn)
+{
+ LCN prev_lcn;
+ s8 *dst_max, *dst_next;
+ int err = -ENOSPC;
+ s8 len_len, lcn_len;
+
+ BUG_ON(start_vcn < 0);
+ BUG_ON(dst_len < 1);
+ if (!rl) {
+ BUG_ON(start_vcn);
+ if (stop_vcn)
+ *stop_vcn = 0;
+ /* Terminator byte. */
+ *dst = 0;
+ return 0;
+ }
+ /* Skip to runlist element containing @start_vcn. */
+ while (rl->length && start_vcn >= rl[1].vcn)
+ rl++;
+ if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn)
+ return -EINVAL;
+ /*
+ * @dst_max is used for bounds checking in
+ * ntfs_write_significant_bytes().
+ */
+ dst_max = dst + dst_len - 1;
+ prev_lcn = 0;
+ /* Do the first partial run if present. */
+ if (start_vcn > rl->vcn) {
+ s64 delta;
+
+ /* We know rl->length != 0 already. */
+ if (rl->length < 0 || rl->lcn < LCN_HOLE)
+ goto err_out;
+ delta = start_vcn - rl->vcn;
+ /* Write length. */
+ len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
+ rl->length - delta);
+ if (len_len < 0)
+ goto size_err;
+ /*
+ * If the logical cluster number (lcn) denotes a hole and we
+ * are on NTFS 3.0+, we don't store it at all, i.e. we need
+ * zero space. On earlier NTFS versions we just write the lcn
+ * change. FIXME: Do we need to write the lcn change or just
+ * the lcn in that case? Not sure as I have never seen this
+ * case on NT4. - We assume that we just need to write the lcn
+ * change until someone tells us otherwise... (AIA)
+ */
+ if (rl->lcn >= 0 || vol->major_ver < 3) {
+ prev_lcn = rl->lcn;
+ if (rl->lcn >= 0)
+ prev_lcn += delta;
+ /* Write change in lcn. */
+ lcn_len = ntfs_write_significant_bytes(dst + 1 +
+ len_len, dst_max, prev_lcn);
+ if (lcn_len < 0)
+ goto size_err;
+ } else
+ lcn_len = 0;
+ dst_next = dst + len_len + lcn_len + 1;
+ if (dst_next > dst_max)
+ goto size_err;
+ /* Update header byte. */
+ *dst = lcn_len << 4 | len_len;
+ /* Position at next mapping pairs array element. */
+ dst = dst_next;
+ /* Go to next runlist element. */
+ rl++;
+ }
+ /* Do the full runs. */
+ for (; rl->length; rl++) {
+ if (rl->length < 0 || rl->lcn < LCN_HOLE)
+ goto err_out;
+ /* Write length. */
+ len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
+ rl->length);
+ if (len_len < 0)
+ goto size_err;
+ /*
+ * If the logical cluster number (lcn) denotes a hole and we
+ * are on NTFS 3.0+, we don't store it at all, i.e. we need
+ * zero space. On earlier NTFS versions we just write the lcn
+ * change. FIXME: Do we need to write the lcn change or just
+ * the lcn in that case? Not sure as I have never seen this
+ * case on NT4. - We assume that we just need to write the lcn
+ * change until someone tells us otherwise... (AIA)
+ */
+ if (rl->lcn >= 0 || vol->major_ver < 3) {
+ /* Write change in lcn. */
+ lcn_len = ntfs_write_significant_bytes(dst + 1 +
+ len_len, dst_max, rl->lcn - prev_lcn);
+ if (lcn_len < 0)
+ goto size_err;
+ prev_lcn = rl->lcn;
+ } else
+ lcn_len = 0;
+ dst_next = dst + len_len + lcn_len + 1;
+ if (dst_next > dst_max)
+ goto size_err;
+ /* Update header byte. */
+ *dst = lcn_len << 4 | len_len;
+ /* Position at next mapping pairs array element. */
+ dst = dst_next;
+ }
+ /* Success. */
+ err = 0;
+size_err:
+ /* Set stop vcn. */
+ if (stop_vcn)
+ *stop_vcn = rl->vcn;
+ /* Add terminator byte. */
+ *dst = 0;
+ return err;
+err_out:
+ if (rl->lcn == LCN_RL_NOT_MAPPED)
+ err = -EINVAL;
+ else
+ err = -EIO;
+ return err;
+}
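+
+/*
+ * A minimal usage sketch (editorial, not used by the driver), mirroring
+ * the decode example after ntfs_mapping_pairs_decompress(): the single run
+ * [vcn 0, lcn 0x5634, length 0x18] needs five bytes and compresses to the
+ * mapping pairs array 21 18 34 56 00.
+ */
+static inline void ntfs_mapping_pairs_build_sketch(const ntfs_volume *vol)
+{
+	runlist_element rl[2] = {
+		{ .vcn = 0, .lcn = 0x5634, .length = 0x18 },
+		{ .vcn = 0x18, .lcn = LCN_ENOENT, .length = 0 },
+	};
+	s8 buf[8];
+
+	BUG_ON(ntfs_get_size_for_mapping_pairs(vol, rl, 0) != 5);
+	BUG_ON(ntfs_mapping_pairs_build(vol, buf, sizeof(buf), rl, 0, NULL));
+	/* buf[0..4] now holds 0x21 0x18 0x34 0x56 0x00. */
+}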
+
+/**
+ * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
+ * @vol: ntfs volume (needed for error reporting)
+ * @runlist: runlist to truncate
+ * @new_length: the new length of the runlist in VCNs
+ *
+ * Truncate the runlist described by @runlist as well as the memory buffer
+ * holding the runlist elements to a length of @new_length VCNs.
+ *
+ * If @new_length lies within the runlist, the runlist elements with VCNs of
+ * @new_length and above are discarded.
+ *
+ * If @new_length lies beyond the runlist, a sparse runlist element is added to
+ * the end of the runlist @runlist or, if the last runlist element is already a
+ * sparse one, that element is extended.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
+ const s64 new_length)
+{
+ runlist_element *rl;
+ int old_size;
+
+ ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length);
+ BUG_ON(!runlist);
+ BUG_ON(new_length < 0);
+ rl = runlist->rl;
+ if (unlikely(!rl)) {
+ /*
+ * Create a runlist consisting of a sparse runlist element of
+ * length @new_length followed by a terminator runlist element.
+ */
+ rl = ntfs_malloc_nofs(PAGE_SIZE);
+ if (unlikely(!rl)) {
+ ntfs_error(vol->sb, "Not enough memory to allocate "
+ "runlist element buffer.");
+ return -ENOMEM;
+ }
+ runlist->rl = rl;
+ rl[1].length = rl->vcn = 0;
+ rl->lcn = LCN_HOLE;
+ rl[1].vcn = rl->length = new_length;
+ rl[1].lcn = LCN_ENOENT;
+ return 0;
+ }
+ BUG_ON(new_length < rl->vcn);
+ /* Find @new_length in the runlist. */
+ while (likely(rl->length && new_length >= rl[1].vcn))
+ rl++;
+ /*
+ * If not at the end of the runlist we need to shrink it.
+ * If at the end of the runlist we need to expand it.
+ */
+ if (rl->length) {
+ runlist_element *trl;
+ BOOL is_end;
+
+ ntfs_debug("Shrinking runlist.");
+ /* Determine the runlist size. */
+ trl = rl + 1;
+ while (likely(trl->length))
+ trl++;
+ old_size = trl - runlist->rl + 1;
+ /* Truncate the run. */
+ rl->length = new_length - rl->vcn;
+ /*
+ * If a run was partially truncated, make the following runlist
+ * element a terminator.
+ */
+ is_end = FALSE;
+ if (rl->length) {
+ rl++;
+ if (!rl->length)
+ is_end = TRUE;
+ rl->vcn = new_length;
+ rl->length = 0;
+ }
+ rl->lcn = LCN_ENOENT;
+ /* Reallocate memory if necessary. */
+ if (!is_end) {
+ int new_size = rl - runlist->rl + 1;
+ rl = ntfs_rl_realloc(runlist->rl, old_size, new_size);
+ if (IS_ERR(rl))
+ ntfs_warning(vol->sb, "Failed to shrink "
+ "runlist buffer. This just "
+ "wastes a bit of memory "
+ "temporarily so we ignore it "
+ "and return success.");
+ else
+ runlist->rl = rl;
+ }
+ } else if (likely(/* !rl->length && */ new_length > rl->vcn)) {
+ ntfs_debug("Expanding runlist.");
+ /*
+ * If there is a previous runlist element and it is a sparse
+ * one, extend it. Otherwise need to add a new, sparse runlist
+ * element.
+ */
+ if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE))
+ (rl - 1)->length = new_length - (rl - 1)->vcn;
+ else {
+ /* Determine the runlist size. */
+ old_size = rl - runlist->rl + 1;
+ /* Reallocate memory if necessary. */
+ rl = ntfs_rl_realloc(runlist->rl, old_size,
+ old_size + 1);
+ if (IS_ERR(rl)) {
+ ntfs_error(vol->sb, "Failed to expand runlist "
+ "buffer, aborting.");
+ return PTR_ERR(rl);
+ }
+ runlist->rl = rl;
+ /*
+ * Set @rl to the same runlist element in the new
+ * runlist as before in the old runlist.
+ */
+ rl += old_size - 1;
+ /* Add a new, sparse runlist element. */
+ rl->lcn = LCN_HOLE;
+ rl->length = new_length - rl->vcn;
+ /* Add a new terminator runlist element. */
+ rl++;
+ rl->length = 0;
+ }
+ rl->vcn = new_length;
+ rl->lcn = LCN_ENOENT;
+ } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ {
+ /* Runlist already has same size as requested. */
+ rl->lcn = LCN_ENOENT;
+ }
+ ntfs_debug("Done.");
+ return 0;
+}
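+
+/*
+ * A minimal usage sketch (editorial, not used by the driver): truncating
+ * an empty runlist creates a single sparse run followed by the terminator.
+ */
+static inline void ntfs_rl_truncate_sketch(const ntfs_volume *vol)
+{
+	runlist runlist;
+
+	ntfs_init_runlist(&runlist);
+	down_write(&runlist.lock);
+	if (!ntfs_rl_truncate_nolock(vol, &runlist, 16)) {
+		/* runlist.rl is now { [vcn 0, hole, len 16], terminator }. */
+		ntfs_free(runlist.rl);
+	}
+	up_write(&runlist.lock);
+}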
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
new file mode 100644
index 000000000000..7107fde59df9
--- /dev/null
+++ b/fs/ntfs/runlist.h
@@ -0,0 +1,89 @@
+/*
+ * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ *
+ * This program/include file is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program/include file is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (in the main directory of the Linux-NTFS
+ * distribution in the file COPYING); if not, write to the Free Software
+ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LINUX_NTFS_RUNLIST_H
+#define _LINUX_NTFS_RUNLIST_H
+
+#include "types.h"
+#include "layout.h"
+#include "volume.h"
+
+/**
+ * runlist_element - in memory vcn to lcn mapping array element
+ * @vcn: starting vcn of the current array element
+ * @lcn: starting lcn of the current array element
+ * @length: length in clusters of the current array element
+ *
+ * The last vcn (in fact the last vcn + 1) is reached when length == 0.
+ *
+ * When lcn == -1 this means that the @length vcns starting at @vcn are not
+ * physically allocated (i.e. this is a hole / data is sparse).
+ */
+typedef struct { /* In memory vcn to lcn mapping structure element. */
+ VCN vcn; /* vcn = Starting virtual cluster number. */
+ LCN lcn; /* lcn = Starting logical cluster number. */
+ s64 length; /* Run length in clusters. */
+} runlist_element;
+
+/**
+ * runlist - in memory vcn to lcn mapping array including a read/write lock
+ * @rl: pointer to an array of runlist elements
+ * @lock: read/write semaphore for serializing access to @rl
+ *
+ */
+typedef struct {
+ runlist_element *rl;
+ struct rw_semaphore lock;
+} runlist;
+
+static inline void ntfs_init_runlist(runlist *rl)
+{
+ rl->rl = NULL;
+ init_rwsem(&rl->lock);
+}
+
+typedef enum {
+ LCN_HOLE = -1, /* Keep this as highest value or die! */
+ LCN_RL_NOT_MAPPED = -2,
+ LCN_ENOENT = -3,
+} LCN_SPECIAL_VALUES;
+
+extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
+ runlist_element *srl);
+
+extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
+ const ATTR_RECORD *attr, runlist_element *old_rl);
+
+extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
+
+extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
+ const runlist_element *rl, const VCN start_vcn);
+
+extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
+ const int dst_len, const runlist_element *rl,
+ const VCN start_vcn, VCN *const stop_vcn);
+
+extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
+ runlist *const runlist, const s64 new_length);
+
+#endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 2c93c59186c7..d915ee20873f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,12 +31,15 @@
#include <linux/moduleparam.h>
#include <linux/smp_lock.h>
-#include "ntfs.h"
#include "sysctl.h"
#include "logfile.h"
#include "quota.h"
#include "dir.h"
+#include "debug.h"
#include "index.h"
+#include "aops.h"
+#include "malloc.h"
+#include "ntfs.h"
/* Number of mounted file systems which have compression enabled. */
static unsigned long ntfs_nr_compression_users;
@@ -827,12 +830,12 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
}
/**
- * setup_lcn_allocator - initialize the cluster allocator
- * @vol: volume structure for which to setup the lcn allocator
+ * ntfs_setup_allocators - initialize the cluster and mft allocators
+ * @vol: volume structure for which to setup the allocators
*
- * Setup the cluster (lcn) allocator to the starting values.
+ * Set up the cluster (lcn) and mft allocators to their starting values.
*/
-static void setup_lcn_allocator(ntfs_volume *vol)
+static void ntfs_setup_allocators(ntfs_volume *vol)
{
#ifdef NTFS_RW
LCN mft_zone_size, mft_lcn;
@@ -902,6 +905,11 @@ static void setup_lcn_allocator(ntfs_volume *vol)
vol->data2_zone_pos = 0;
ntfs_debug("vol->data2_zone_pos = 0x%llx",
(unsigned long long)vol->data2_zone_pos);
+
+ /* Set the mft data allocation position to mft record 24. */
+ vol->mft_data_pos = 24;
+ ntfs_debug("vol->mft_data_pos = 0x%llx",
+ (unsigned long long)vol->mft_data_pos);
#endif /* NTFS_RW */
}
@@ -938,8 +946,8 @@ static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
/* No VFS initiated operations allowed for $MFTMirr. */
tmp_ino->i_op = &ntfs_empty_inode_ops;
tmp_ino->i_fop = &ntfs_empty_file_ops;
- /* Put back our special address space operations. */
- tmp_ino->i_mapping->a_ops = &ntfs_mft_aops;
+ /* Put in our special address space operations. */
+ tmp_ino->i_mapping->a_ops = &ntfs_mst_aops;
tmp_ni = NTFS_I(tmp_ino);
/* The $MFTMirr, like the $MFT is multi sector transfer protected. */
NInoSetMstProtected(tmp_ni);
@@ -2334,8 +2342,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
*/
result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
- /* Initialize the cluster allocator. */
- setup_lcn_allocator(vol);
+ /* Initialize the cluster and mft allocators. */
+ ntfs_setup_allocators(vol);
brelse(bh);
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
index a98731ece42e..08a55aa53d4e 100644
--- a/fs/ntfs/types.h
+++ b/fs/ntfs/types.h
@@ -53,34 +53,6 @@ typedef sle64 leLCN;
typedef s64 LSN;
typedef sle64 leLSN;
-/**
- * runlist_element - in memory vcn to lcn mapping array element
- * @vcn: starting vcn of the current array element
- * @lcn: starting lcn of the current array element
- * @length: length in clusters of the current array element
- *
- * The last vcn (in fact the last vcn + 1) is reached when length == 0.
- *
- * When lcn == -1 this means that the count vcns starting at vcn are not
- * physically allocated (i.e. this is a hole / data is sparse).
- */
-typedef struct { /* In memory vcn to lcn mapping structure element. */
- VCN vcn; /* vcn = Starting virtual cluster number. */
- LCN lcn; /* lcn = Starting logical cluster number. */
- s64 length; /* Run length in clusters. */
-} runlist_element;
-
-/**
- * runlist - in memory vcn to lcn mapping array including a read/write lock
- * @rl: pointer to an array of runlist elements
- * @lock: read/write spinlock for serializing access to @rl
- *
- */
-typedef struct {
- runlist_element *rl;
- struct rw_semaphore lock;
-} runlist;
-
typedef enum {
FALSE = 0,
TRUE = 1
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index ec7405a80b4c..560b0ea255b0 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -19,6 +19,8 @@
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include "types.h"
+#include "debug.h"
#include "ntfs.h"
/*
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
index 276ed97982d3..879cdf1d5bd3 100644
--- a/fs/ntfs/upcase.c
+++ b/fs/ntfs/upcase.c
@@ -24,6 +24,7 @@
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include "malloc.h"
#include "ntfs.h"
ntfschar *generate_default_upcase(void)
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
index 9775fa65895e..4b97fa8635a8 100644
--- a/fs/ntfs/volume.h
+++ b/fs/ntfs/volume.h
@@ -24,6 +24,8 @@
#ifndef _LINUX_NTFS_VOLUME_H
#define _LINUX_NTFS_VOLUME_H
+#include <linux/rwsem.h>
+
#include "types.h"
#include "layout.h"
@@ -81,6 +83,8 @@ typedef struct {
#ifdef NTFS_RW
/* Variables used by the cluster and mft allocators. */
+ s64 mft_data_pos; /* Mft record number at which to
+ allocate the next mft record. */
LCN mft_zone_start; /* First cluster of the mft zone. */
LCN mft_zone_end; /* First cluster beyond the mft zone. */
LCN mft_zone_pos; /* Current position in the mft zone. */
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index c802d5a2f16a..97fbb86195ef 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -29,7 +29,6 @@ EXPORT_SYMBOL(posix_acl_equiv_mode);
EXPORT_SYMBOL(posix_acl_from_mode);
EXPORT_SYMBOL(posix_acl_create_masq);
EXPORT_SYMBOL(posix_acl_chmod_masq);
-EXPORT_SYMBOL(posix_acl_masq_nfs_mode);
EXPORT_SYMBOL(posix_acl_permission);
/*
@@ -380,44 +379,3 @@ posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode)
return 0;
}
-
-/*
- * Adjust the mode parameter so that NFSv2 grants nobody permissions
- * that may not be granted by the ACL. This is necessary because NFSv2
- * may compute access permissions on the client side, and may serve cached
- * data whenever it assumes access would be granted. Since ACLs may also
- * be used to deny access to specific users, the minimal permissions
- * for secure operation over NFSv2 are very restrictive. Permissions
- * granted to users via Access Control Lists will not be effective over
- * NFSv2.
- *
- * Privilege escalation can only happen for read operations, as writes are
- * always carried out on the NFS server, where the proper access checks are
- * implemented.
- */
-int
-posix_acl_masq_nfs_mode(struct posix_acl *acl, mode_t *mode_p)
-{
- struct posix_acl_entry *pa, *pe; int min_perm = S_IRWXO;
-
- FOREACH_ACL_ENTRY(pa, acl, pe) {
- switch(pa->e_tag) {
- case ACL_USER_OBJ:
- break;
-
- case ACL_USER:
- case ACL_GROUP_OBJ:
- case ACL_GROUP:
- case ACL_MASK:
- case ACL_OTHER:
- min_perm &= pa->e_perm;
- break;
-
- default:
- return -EIO;
- }
- }
- *mode_p = (*mode_p & ~(S_IRWXG|S_IRWXO)) | (min_perm << 3) | min_perm;
-
- return 0;
-}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 987240398c4a..91060b9921cc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -473,7 +473,7 @@ out:
static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
{
- if (vfs_permission(inode, mask) != 0)
+ if (generic_permission(inode, mask, NULL) != 0)
return -EACCES;
return proc_check_root(inode);
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 6151f0592f28..76779d4e5a75 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -149,9 +149,6 @@ struct proc_dir_entry proc_root = {
.parent = &proc_root,
};
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(proc_sys_root);
-#endif
EXPORT_SYMBOL(proc_symlink);
EXPORT_SYMBOL(proc_mkdir);
EXPORT_SYMBOL(create_proc_entry);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7998fa56282a..01b379310276 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -9,7 +9,7 @@ char *task_mem(struct mm_struct *mm, char *buffer)
unsigned long data, text, lib;
data = mm->total_vm - mm->shared_vm - mm->stack_vm;
- text = (mm->end_code - mm->start_code) >> 10;
+ text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
buffer += sprintf(buffer,
"VmSize:\t%8lu kB\n"
@@ -18,12 +18,14 @@ char *task_mem(struct mm_struct *mm, char *buffer)
"VmData:\t%8lu kB\n"
"VmStk:\t%8lu kB\n"
"VmExe:\t%8lu kB\n"
- "VmLib:\t%8lu kB\n",
+ "VmLib:\t%8lu kB\n"
+ "VmPTE:\t%8lu kB\n",
(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
mm->locked_vm << (PAGE_SHIFT-10),
mm->rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
- mm->stack_vm << (PAGE_SHIFT-10), text, lib);
+ mm->stack_vm << (PAGE_SHIFT-10), text, lib,
+ (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
return buffer;
}
@@ -36,7 +38,8 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
int *data, int *resident)
{
*shared = mm->shared_vm;
- *text = (mm->end_code - mm->start_code) >> PAGE_SHIFT;
+ *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
+ >> PAGE_SHIFT;
*data = mm->total_vm - mm->shared_vm - *text;
*resident = mm->rss;
return mm->total_vm;
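
For illustration (not part of the patch): the fix above counts the whole pages spanned by the text segment rather than the raw byte difference, so a segment whose start or end falls mid-page is no longer undercounted; VmPTE similarly charges mm->nr_ptes full page-table pages. A standalone sketch of the arithmetic, assuming 4 KiB pages (the constants below are illustrative stand-ins, not the kernel headers):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long start_code = 0x08048100;	/* not page aligned */
	unsigned long end_code   = 0x0804a010;

	/* Old: raw byte difference >> 10 undercounts partial pages. */
	unsigned long old_kb = (end_code - start_code) >> 10;
	/* New: round out to page boundaries, then convert to kB. */
	unsigned long new_kb =
		(PAGE_ALIGN(end_code) - (start_code & PAGE_MASK)) >> 10;

	printf("old VmExe: %lu kB, new VmExe: %lu kB\n", old_kb, new_kb);
	return 0;
}
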
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index cdc2a91609d7..c9f8e2feef10 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -736,7 +736,7 @@ static inline int this_blocknr_allocation_would_make_it_a_large_file(reiserfs_bl
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
static inline void displace_new_packing_locality (reiserfs_blocknr_hint_t *hint)
{
- struct key * key = &hint->key;
+ struct reiserfs_key * key = &hint->key;
hint->th->displace_new_blocks = 0;
hint->search_start = hint->beg + keyed_hash((char*)(&key->k_objectid),4) % (hint->end - hint->beg);
@@ -777,7 +777,7 @@ static inline int old_way (reiserfs_blocknr_hint_t * hint)
static inline void hundredth_slices (reiserfs_blocknr_hint_t * hint)
{
- struct key * key = &hint->key;
+ struct reiserfs_key * key = &hint->key;
b_blocknr_t slice_start;
slice_start = (keyed_hash((char*)(&key->k_dir_id),4) % 100) * (hint->end / 100);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index c222a92b5a94..268474f46830 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -12,7 +12,7 @@
#include <linux/buffer_head.h>
#include <asm/uaccess.h>
-extern struct key MIN_KEY;
+extern struct reiserfs_key MIN_KEY;
static int reiserfs_readdir (struct file *, void *, filldir_t);
int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) ;
@@ -46,7 +46,7 @@ static int reiserfs_readdir (struct file * filp, void * dirent, filldir_t filldi
INITIALIZE_PATH (path_to_entry);
struct buffer_head * bh;
int item_num, entry_num;
- const struct key * rkey;
+ const struct reiserfs_key * rkey;
struct item_head * ih, tmp_ih;
int search_res;
char * local_buf;
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index f038aa7075b2..814561b10324 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -163,7 +163,7 @@ static void create_virtual_node (struct tree_balance * tb, int h)
/* set right merge flag we take right delimiting key and check whether it is a mergeable item */
if (tb->CFR[0]) {
- struct key * key;
+ struct reiserfs_key * key;
key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]);
if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE ||
@@ -1140,7 +1140,7 @@ static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree
struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h);
int levbytes = tb->insert_size[h];
struct item_head * ih;
- struct key * r_key = NULL;
+ struct reiserfs_key * r_key = NULL;
ih = B_N_PITEM_HEAD (Sh, 0);
if ( tb->CFR[h] )
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index 5b9dee2ac2ae..d1033e3f89ea 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -133,7 +133,7 @@ static void internal_insert_childs (struct buffer_info * cur_bi,
struct buffer_head * cur = cur_bi->bi_bh;
struct block_head * blkh;
int nr;
- struct key * ih;
+ struct reiserfs_key * ih;
struct disk_child new_dc[2];
struct disk_child * dc;
int i;
@@ -209,7 +209,7 @@ static void internal_delete_pointers_items (
struct buffer_head * cur = cur_bi->bi_bh;
int nr;
struct block_head * blkh;
- struct key * key;
+ struct reiserfs_key * key;
struct disk_child * dc;
RFALSE( cur == NULL, "buffer is 0");
@@ -300,7 +300,7 @@ static void internal_copy_pointers_items (
int nr_dest, nr_src;
int dest_order, src_order;
struct block_head * blkh;
- struct key * key;
+ struct reiserfs_key * key;
struct disk_child * dc;
nr_src = B_NR_ITEMS (src);
@@ -409,7 +409,7 @@ static void internal_insert_key (struct buffer_info * dest_bi,
struct buffer_head * dest = dest_bi->bi_bh;
int nr;
struct block_head * blkh;
- struct key * key;
+ struct reiserfs_key * key;
RFALSE( dest == NULL || src == NULL,
"source(%p) or dest(%p) buffer is 0", src, dest);
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index c315edbb2187..57779620a626 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -26,7 +26,7 @@ static void sd_decrement_key (struct cpu_key * key)
set_cpu_key_k_offset(key, (loff_t)(-1));
}
-static int sd_is_left_mergeable (struct key * key, unsigned long bsize)
+static int sd_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
{
return 0;
}
@@ -145,7 +145,7 @@ static void direct_decrement_key (struct cpu_key * key)
}
-static int direct_is_left_mergeable (struct key * key, unsigned long bsize)
+static int direct_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
{
int version = le_key_version (key);
return ((le_key_k_offset (version, key) & (bsize - 1)) != 1);
@@ -250,7 +250,7 @@ static void indirect_decrement_key (struct cpu_key * key)
// if it is not first item of the body, then it is mergeable
-static int indirect_is_left_mergeable (struct key * key, unsigned long bsize)
+static int indirect_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
{
int version = le_key_version (key);
return (le_key_k_offset (version, key) != 1);
@@ -403,7 +403,7 @@ static void direntry_decrement_key (struct cpu_key * key)
}
-static int direntry_is_left_mergeable (struct key * key, unsigned long bsize)
+static int direntry_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
{
if (le32_to_cpu (key->u.k_offset_v1.k_offset) == DOT_OFFSET)
return 0;
@@ -691,7 +691,7 @@ static void errcatch_decrement_key (struct cpu_key * key)
}
-static int errcatch_is_left_mergeable (struct key * key, unsigned long bsize)
+static int errcatch_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
{
reiserfs_warning (NULL, "green-16003: Invalid item type observed, run fsck ASAP");
return 0;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index e12fcdff5a69..30e19a145fa7 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1166,7 +1166,7 @@ static int entry_points_to_object (const char * name, int len, struct reiserfs_d
/* sets key of objectid the entry has to point to */
-static void set_ino_in_dir_entry (struct reiserfs_dir_entry * de, struct key * key)
+static void set_ino_in_dir_entry (struct reiserfs_dir_entry * de, struct reiserfs_key * key)
{
/* JDM These operations are endian safe - both are le */
de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 333ad7c001b3..5f08f356da1a 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -28,7 +28,7 @@ static char * reiserfs_cpu_offset (struct cpu_key * key)
}
-static char * le_offset (struct key * key)
+static char * le_offset (struct reiserfs_key * key)
{
int version;
@@ -57,7 +57,7 @@ static char * cpu_type (struct cpu_key * key)
}
-static char * le_type (struct key * key)
+static char * le_type (struct reiserfs_key * key)
{
int version;
@@ -76,7 +76,7 @@ static char * le_type (struct key * key)
/* %k */
-static void sprintf_le_key (char * buf, struct key * key)
+static void sprintf_le_key (char * buf, struct reiserfs_key * key)
{
if (key)
sprintf (buf, "[%d %d %s %s]", le32_to_cpu (key->k_dir_id),
@@ -213,7 +213,7 @@ prepare_error_buf( const char *fmt, va_list args )
switch (what) {
case 'k':
- sprintf_le_key (p, va_arg(args, struct key *));
+ sprintf_le_key (p, va_arg(args, struct reiserfs_key *));
break;
case 'K':
sprintf_cpu_key (p, va_arg(args, struct cpu_key *));
@@ -462,7 +462,7 @@ void print_path (struct tree_balance * tb, struct path * path)
dc_size)...*/
static int print_internal (struct buffer_head * bh, int first, int last)
{
- struct key * key;
+ struct reiserfs_key * key;
struct disk_child * dc;
int i;
int from, to;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index eae0b197fd34..caea0377b668 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -93,7 +93,7 @@ inline void copy_item_head(struct item_head * p_v_to,
Returns: -1 if key1 < key2
0 if key1 == key2
1 if key1 > key2 */
-inline int comp_short_keys (const struct key * le_key,
+inline int comp_short_keys (const struct reiserfs_key * le_key,
const struct cpu_key * cpu_key)
{
__u32 * p_s_le_u32, * p_s_cpu_u32;
@@ -117,7 +117,7 @@ inline int comp_short_keys (const struct key * le_key,
Compare keys using all 4 key fields.
Returns: -1 if key1 < key2,
0 if key1 == key2, 1 if key1 > key2 */
-inline int comp_keys (const struct key * le_key, const struct cpu_key * cpu_key)
+inline int comp_keys (const struct reiserfs_key * le_key, const struct cpu_key * cpu_key)
{
int retval;
@@ -174,7 +174,7 @@ inline int comp_cpu_keys (const struct cpu_key * key1,
return 0;
}
-inline int comp_short_le_keys (const struct key * key1, const struct key * key2)
+inline int comp_short_le_keys (const struct reiserfs_key * key1, const struct reiserfs_key * key2)
{
__u32 * p_s_1_u32, * p_s_2_u32;
int n_key_length = REISERFS_SHORT_KEY_LEN;
@@ -216,7 +216,7 @@ inline void cpu_key2cpu_key (struct cpu_key * to, const struct cpu_key * from)
}
-inline void le_key2cpu_key (struct cpu_key * to, const struct key * from)
+inline void le_key2cpu_key (struct cpu_key * to, const struct reiserfs_key * from)
{
to->on_disk_key.k_dir_id = le32_to_cpu (from->k_dir_id);
to->on_disk_key.k_objectid = le32_to_cpu (from->k_objectid);
@@ -236,9 +236,9 @@ inline void le_key2cpu_key (struct cpu_key * to, const struct key * from)
// this does not say which one is bigger, it only returns 1 if keys
// are not equal, 0 otherwise
-inline int comp_le_keys (const struct key * k1, const struct key * k2)
+inline int comp_le_keys (const struct reiserfs_key * k1, const struct reiserfs_key * k2)
{
- return memcmp (k1, k2, sizeof (struct key));
+ return memcmp (k1, k2, sizeof (struct reiserfs_key));
}
/**************************************************************************
@@ -272,7 +272,7 @@ inline int bin_search (
int n_rbound, n_lbound, n_j;
for ( n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0))/2; n_lbound <= n_rbound; n_j = (n_rbound + n_lbound)/2 )
- switch( COMP_KEYS((struct key *)((char * )p_v_base + n_j * p_n_width), (struct cpu_key *)p_v_key) ) {
+ switch( COMP_KEYS((struct reiserfs_key *)((char * )p_v_base + n_j * p_n_width), (struct cpu_key *)p_v_key) ) {
case -1: n_lbound = n_j + 1; continue;
case 1: n_rbound = n_j - 1; continue;
case 0: *p_n_pos = n_j; return ITEM_FOUND; /* Key found in the array. */
@@ -291,17 +291,17 @@ extern struct tree_balance * cur_tb;
/* Minimal possible key. It is never in the tree. */
-const struct key MIN_KEY = {0, 0, {{0, 0},}};
+const struct reiserfs_key MIN_KEY = {0, 0, {{0, 0},}};
/* Maximal possible key. It is never in the tree. */
-const struct key MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff, 0xffffffff},}};
+const struct reiserfs_key MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff, 0xffffffff},}};
/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom
of the path, and going upwards. We must check the path's validity at each step. If the key is not in
the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this
case we return a special key, either MIN_KEY or MAX_KEY. */
-inline const struct key * get_lkey (
+inline const struct reiserfs_key * get_lkey (
const struct path * p_s_chk_path,
const struct super_block * p_s_sb
) {
@@ -340,7 +340,7 @@ inline const struct key * get_lkey (
/* Get delimiting key of the buffer at the path and its right neighbor. */
-inline const struct key * get_rkey (
+inline const struct reiserfs_key * get_rkey (
const struct path * p_s_chk_path,
const struct super_block * p_s_sb
) {
@@ -802,7 +802,7 @@ io_error:
{
int pos = p_s_last_element->pe_position;
int limit = B_NR_ITEMS(p_s_bh);
- struct key *le_key;
+ struct reiserfs_key *le_key;
if (p_s_search_path->reada & PATH_READA_BACK)
limit = 0;
@@ -1247,7 +1247,7 @@ void padd_item (char * item, int total_length, int length)
}
#ifdef REISERQUOTA_DEBUG
-char key2type(struct key *ih)
+char key2type(struct reiserfs_key *ih)
{
if (is_direntry_le_key(2, ih))
return 'd';
@@ -1417,7 +1417,7 @@ int reiserfs_delete_item (struct reiserfs_transaction_handle *th,
/* this deletes item which never gets split */
void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
struct inode *inode,
- struct key * key)
+ struct reiserfs_key * key)
{
struct tree_balance tb;
INITIALIZE_PATH (path);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 23dd560a38aa..3e7d7e0845e8 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -106,7 +106,7 @@ void reiserfs_unlockfs(struct super_block *s) {
reiserfs_allow_writes(s) ;
}
-extern const struct key MAX_KEY;
+extern const struct reiserfs_key MAX_KEY;
/* this is used to delete "save link" when there are no items of a
@@ -116,7 +116,7 @@ extern const struct key MAX_KEY;
protecting unlink is bigger than the key of the "save link" which
protects truncate), so there are no items left on which to complete
the truncate */
-static int remove_save_link_only (struct super_block * s, struct key * key, int oid_free)
+static int remove_save_link_only (struct super_block * s, struct reiserfs_key * key, int oid_free)
{
struct reiserfs_transaction_handle th;
int err;
@@ -140,7 +140,7 @@ static int finish_unfinished (struct super_block * s)
{
INITIALIZE_PATH (path);
struct cpu_key max_cpu_key, obj_key;
- struct key save_link_key;
+ struct reiserfs_key save_link_key;
int retval = 0;
struct item_head * ih;
struct buffer_head * bh;
@@ -335,7 +335,7 @@ void add_save_link (struct reiserfs_transaction_handle * th,
int remove_save_link (struct inode * inode, int truncate)
{
struct reiserfs_transaction_handle th;
- struct key key;
+ struct reiserfs_key key;
int err;
/* we are going to do one balancing only */
diff --git a/fs/select.c b/fs/select.c
index e0a87cb4f733..57e776cafb3b 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -268,8 +268,6 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
return retval;
}
-EXPORT_SYMBOL(do_select);
-
static void *select_bits_alloc(int size)
{
return kmalloc(6 * size, GFP_KERNEL);
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1001a64b35da..6db62f3aa397 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -368,7 +368,6 @@ parse_options(struct smb_mount_data_kernel *mnt, char *options)
&optopt, &optarg, &flags, &value)) > 0) {
VERBOSE("'%s' -> '%s'\n", optopt, optarg ? optarg : "<none>");
-
switch (c) {
case 1:
/* got a "flag" option */
@@ -383,15 +382,19 @@ parse_options(struct smb_mount_data_kernel *mnt, char *options)
break;
case 'u':
mnt->uid = value;
+ flags |= SMB_MOUNT_UID;
break;
case 'g':
mnt->gid = value;
+ flags |= SMB_MOUNT_GID;
break;
case 'f':
mnt->file_mode = (value & S_IRWXUGO) | S_IFREG;
+ flags |= SMB_MOUNT_FMODE;
break;
case 'd':
mnt->dir_mode = (value & S_IRWXUGO) | S_IFDIR;
+ flags |= SMB_MOUNT_DMODE;
break;
case 'i':
strlcpy(mnt->codepage.local_name, optarg,
@@ -429,9 +432,9 @@ smb_show_options(struct seq_file *s, struct vfsmount *m)
if (mnt->flags & opts[i].flag)
seq_printf(s, ",%s", opts[i].name);
- if (mnt->uid != 0)
+ if (mnt->flags & SMB_MOUNT_UID)
seq_printf(s, ",uid=%d", mnt->uid);
- if (mnt->gid != 0)
+ if (mnt->flags & SMB_MOUNT_GID)
seq_printf(s, ",gid=%d", mnt->gid);
if (mnt->mounted_uid != 0)
seq_printf(s, ",mounted_uid=%d", mnt->mounted_uid);
@@ -440,8 +443,10 @@ smb_show_options(struct seq_file *s, struct vfsmount *m)
* Defaults for file_mode and dir_mode are unknown to us; they
* depend on the current umask of the user doing the mount.
*/
- seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
- seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
+ if (mnt->flags & SMB_MOUNT_FMODE)
+ seq_printf(s, ",file_mode=%04o", mnt->file_mode & S_IRWXUGO);
+ if (mnt->flags & SMB_MOUNT_DMODE)
+ seq_printf(s, ",dir_mode=%04o", mnt->dir_mode & S_IRWXUGO);
if (strcmp(mnt->codepage.local_name, CONFIG_NLS_DEFAULT))
seq_printf(s, ",iocharset=%s", mnt->codepage.local_name);
@@ -566,8 +571,13 @@ int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
mnt->file_mode = (oldmnt->file_mode & S_IRWXUGO) | S_IFREG;
mnt->dir_mode = (oldmnt->dir_mode & S_IRWXUGO) | S_IFDIR;
- mnt->flags = (oldmnt->file_mode >> 9);
+ mnt->flags = (oldmnt->file_mode >> 9) | SMB_MOUNT_UID |
+ SMB_MOUNT_GID | SMB_MOUNT_FMODE | SMB_MOUNT_DMODE;
} else {
+ mnt->file_mode = S_IRWXU | S_IRGRP | S_IXGRP |
+ S_IROTH | S_IXOTH | S_IFREG;
+ mnt->dir_mode = S_IRWXU | S_IRGRP | S_IXGRP |
+ S_IROTH | S_IXOTH | S_IFDIR;
if (parse_options(mnt, raw_data))
goto out_bad_option;
}
@@ -599,6 +609,7 @@ int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
sb->s_root = d_alloc_root(root_inode);
if (!sb->s_root)
goto out_no_root;
+
smb_new_dentry(sb->s_root);
return 0;
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 418c0c36ac8b..1780d50afe2d 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -2074,7 +2074,7 @@ out:
return result;
}
-void smb_decode_unix_basic(struct smb_fattr *fattr, char *p)
+void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p)
{
u64 size, disk_bytes;
@@ -2111,8 +2111,17 @@ void smb_decode_unix_basic(struct smb_fattr *fattr, char *p)
fattr->f_ctime = smb_ntutc2unixutc(LVAL(p, 16));
fattr->f_atime = smb_ntutc2unixutc(LVAL(p, 24));
fattr->f_mtime = smb_ntutc2unixutc(LVAL(p, 32));
- fattr->f_uid = LVAL(p, 40);
- fattr->f_gid = LVAL(p, 48);
+
+ if (server->mnt->flags & SMB_MOUNT_UID)
+ fattr->f_uid = server->mnt->uid;
+ else
+ fattr->f_uid = LVAL(p, 40);
+
+ if (server->mnt->flags & SMB_MOUNT_GID)
+ fattr->f_gid = server->mnt->gid;
+ else
+ fattr->f_gid = LVAL(p, 48);
+
fattr->f_mode |= smb_filetype_to_mode(WVAL(p, 56));
if (S_ISBLK(fattr->f_mode) || S_ISCHR(fattr->f_mode)) {
@@ -2121,10 +2130,19 @@ void smb_decode_unix_basic(struct smb_fattr *fattr, char *p)
fattr->f_rdev = MKDEV(major & 0xffffffff, minor & 0xffffffff);
if (MAJOR(fattr->f_rdev) != (major & 0xffffffff) ||
- MINOR(fattr->f_rdev) != (minor & 0xffffffff))
+ MINOR(fattr->f_rdev) != (minor & 0xffffffff))
fattr->f_rdev = 0;
}
+
fattr->f_mode |= LVAL(p, 84);
+
+ if ( (server->mnt->flags & SMB_MOUNT_DMODE) &&
+ (S_ISDIR(fattr->f_mode)) )
+ fattr->f_mode = (server->mnt->dir_mode & (S_IRWXU | S_IRWXG | S_IRWXO)) | S_IFDIR;
+ else if ( (server->mnt->flags & SMB_MOUNT_FMODE) &&
+ !(S_ISDIR(fattr->f_mode)) )
+ fattr->f_mode = (server->mnt->file_mode & (S_IRWXU | S_IRWXG | S_IRWXO)) | S_IFREG;
+
}
/*
@@ -2210,7 +2228,7 @@ smb_decode_long_dirent(struct smb_sb_info *server, char *p, int level,
/* FIXME: should we check the length?? */
p += 8;
- smb_decode_unix_basic(fattr, p);
+ smb_decode_unix_basic(fattr, server, p);
VERBOSE("info SMB_FIND_FILE_UNIX at %p, len=%d, name=%.*s\n",
p, len, len, qname->name);
break;
@@ -2769,7 +2787,7 @@ smb_proc_getattr_unix(struct smb_sb_info *server, struct dentry *dir,
if (result < 0)
goto out_free;
- smb_decode_unix_basic(attr, req->rq_data);
+ smb_decode_unix_basic(attr, server, req->rq_data);
out_free:
smb_rput(req);
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index e8946c455d8b..50df777b6189 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -24,7 +24,7 @@ extern int smb_proc_rmdir(struct dentry *dentry);
extern int smb_proc_unlink(struct dentry *dentry);
extern int smb_proc_flush(struct smb_sb_info *server, __u16 fileid);
extern void smb_init_root_dirent(struct smb_sb_info *server, struct smb_fattr *fattr);
-extern void smb_decode_unix_basic(struct smb_fattr *fattr, char *p);
+extern void smb_decode_unix_basic(struct smb_fattr *fattr, struct smb_sb_info *server, char *p);
extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr);
extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr);
extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor);
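
For illustration (not part of the patch): the thread running through these smbfs changes is that uid=, gid=, file_mode= and dir_mode= are now recorded in SMB_MOUNT_* flags when explicitly given, and smb_decode_unix_basic() consults those flags so that explicit mount options override whatever the server's UNIX extensions report (previously a uid=0 option was indistinguishable from no option at all). A minimal restatement of the override rule, with illustrative stand-in types and flag values:

enum { SMB_MOUNT_UID = 1 << 0, SMB_MOUNT_GID = 1 << 1 };

struct mnt_opts {
	unsigned int flags;	/* which options were explicitly given */
	unsigned int uid, gid;
};

/* Prefer the mount option only when it was actually specified. */
static unsigned int effective_uid(const struct mnt_opts *m,
				  unsigned int server_uid)
{
	return (m->flags & SMB_MOUNT_UID) ? m->uid : server_uid;
}

static unsigned int effective_gid(const struct mnt_opts *m,
				  unsigned int server_gid)
{
	return (m->flags & SMB_MOUNT_GID) ? m->gid : server_gid;
}
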
diff --git a/fs/xattr.c b/fs/xattr.c
index a71900b5349a..93dee70a1dbe 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -5,6 +5,7 @@
Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
+ Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
*/
#include <linux/fs.h>
#include <linux/slab.h>
@@ -14,6 +15,7 @@
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/module.h>
#include <asm/uaccess.h>
/*
@@ -348,3 +350,131 @@ sys_fremovexattr(int fd, char __user *name)
fput(f);
return error;
}
+
+
+static const char *
+strcmp_prefix(const char *a, const char *a_prefix)
+{
+ while (*a_prefix && *a == *a_prefix) {
+ a++;
+ a_prefix++;
+ }
+ return *a_prefix ? NULL : a;
+}
+
+/*
+ * In order to implement different sets of xattr operations for each xattr
+ * prefix with the generic xattr API, a filesystem should create a
+ * null-terminated array of struct xattr_handler (one for each prefix) and
+ * hang a pointer to it off of the s_xattr field of the superblock.
+ *
+ * The generic_fooxattr() functions will use this list to dispatch xattr
+ * operations to the correct xattr_handler.
+ */
+#define for_each_xattr_handler(handlers, handler) \
+ for ((handler) = *(handlers)++; \
+ (handler) != NULL; \
+ (handler) = *(handlers)++)
+
+/*
+ * Find the xattr_handler with the matching prefix.
+ */
+static struct xattr_handler *
+xattr_resolve_name(struct xattr_handler **handlers, const char **name)
+{
+ struct xattr_handler *handler;
+
+ if (!*name)
+ return NULL;
+
+ for_each_xattr_handler(handlers, handler) {
+ const char *n = strcmp_prefix(*name, handler->prefix);
+ if (n) {
+ *name = n;
+ break;
+ }
+ }
+ return handler;
+}
+
+/*
+ * Find the handler for the prefix and dispatch its get() operation.
+ */
+ssize_t
+generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
+{
+ struct xattr_handler *handler;
+ struct inode *inode = dentry->d_inode;
+
+ handler = xattr_resolve_name(inode->i_sb->s_xattr, &name);
+ if (!handler)
+ return -EOPNOTSUPP;
+ return handler->get(inode, name, buffer, size);
+}
+
+/*
+ * Combine the results of the list() operation from every xattr_handler in the
+ * list.
+ */
+ssize_t
+generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ struct inode *inode = dentry->d_inode;
+ struct xattr_handler *handler, **handlers = inode->i_sb->s_xattr;
+ unsigned int size = 0;
+
+ if (!buffer) {
+ for_each_xattr_handler(handlers, handler)
+ size += handler->list(inode, NULL, 0, NULL, 0);
+ } else {
+ char *buf = buffer;
+
+ for_each_xattr_handler(handlers, handler) {
+ size = handler->list(inode, buf, buffer_size, NULL, 0);
+ if (size > buffer_size)
+ return -ERANGE;
+ buf += size;
+ buffer_size -= size;
+ }
+ size = buf - buffer;
+ }
+ return size;
+}
+
+/*
+ * Find the handler for the prefix and dispatch its set() operation.
+ */
+int
+generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
+{
+ struct xattr_handler *handler;
+ struct inode *inode = dentry->d_inode;
+
+ if (size == 0)
+ value = ""; /* empty EA, do not remove */
+ handler = xattr_resolve_name(inode->i_sb->s_xattr, &name);
+ if (!handler)
+ return -EOPNOTSUPP;
+ return handler->set(inode, name, value, size, flags);
+}
+
+/*
+ * Find the handler for the prefix and dispatch its set() operation to remove
+ * any associated extended attribute.
+ */
+int
+generic_removexattr(struct dentry *dentry, const char *name)
+{
+ struct xattr_handler *handler;
+ struct inode *inode = dentry->d_inode;
+
+ handler = xattr_resolve_name(inode->i_sb->s_xattr, &name);
+ if (!handler)
+ return -EOPNOTSUPP;
+ return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
+}
+
+EXPORT_SYMBOL(generic_getxattr);
+EXPORT_SYMBOL(generic_listxattr);
+EXPORT_SYMBOL(generic_setxattr);
+EXPORT_SYMBOL(generic_removexattr);
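
For illustration (not part of the patch): a filesystem opts into this generic dispatcher by hanging a NULL-terminated array of xattr_handler pointers off sb->s_xattr and pointing its inode operations at the generic_*xattr() helpers. xattr_resolve_name() strips the matched prefix, so strcmp_prefix("user.comment", "user.") hands the handler just "comment". The sketch below is hypothetical (the ex_* names are invented; the signatures are inferred from the calls above):

static int ex_user_get(struct inode *inode, const char *name,
		       void *buffer, size_t size)
{
	/* @name arrives with the "user." prefix already stripped. */
	return -ENODATA;	/* placeholder: fetch the value from disk */
}

static int ex_user_set(struct inode *inode, const char *name,
		       const void *value, size_t size, int flags)
{
	/* value == NULL with XATTR_REPLACE means "remove"; see
	 * generic_removexattr() above. */
	return 0;		/* placeholder: store or remove on disk */
}

static size_t ex_user_list(struct inode *inode, char *list, size_t list_size,
			   const char *name, size_t name_len)
{
	/* Called with list == NULL first to size the buffer; see
	 * generic_listxattr() above. */
	return 0;		/* placeholder: emit "user.<name>\0" names */
}

static struct xattr_handler ex_user_handler = {
	.prefix	= "user.",
	.get	= ex_user_get,
	.set	= ex_user_set,
	.list	= ex_user_list,
};

static struct xattr_handler *exfs_xattr_handlers[] = {
	&ex_user_handler,
	NULL,	/* terminates for_each_xattr_handler() */
};

/* At mount time:  sb->s_xattr = exfs_xattr_handlers;
 * and in the inode_operations:
 *	.getxattr    = generic_getxattr,
 *	.setxattr    = generic_setxattr,
 *	.listxattr   = generic_listxattr,
 *	.removexattr = generic_removexattr,
 */
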